use std::io::{self, Read, Write};

use rayon::prelude::*;

const MAX_IOV: usize = 1024;

const STREAM_BUF: usize = 8 * 1024 * 1024;

const PARALLEL_THRESHOLD: usize = 64 * 1024 * 1024;

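// Per-mask shuffle indices for left-packing the kept bytes of an 8-byte block.
// Entry `m` lists, in increasing order, the bit positions set in `m`; unused
// slots stay 0. Evaluated at compile time and used by the delete paths below.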
#[cfg(target_arch = "x86_64")]
static COMPACT_LUT: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};
44
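/// Writes all slices, batching at most `MAX_IOV` per `write_vectored` call and
/// falling back to per-slice `write_all` when the kernel reports a short write.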
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
77
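/// Allocates a `Vec<u8>` of `len` bytes without zero-initialising it; the caller
/// must overwrite every byte before reading. On Linux, buffers of 2 MiB or more
/// are hinted towards transparent huge pages via `madvise(MADV_HUGEPAGE)`.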
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}
101
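/// Builds the 256-entry byte translation table for `set1` -> `set2`. When `set2`
/// is shorter than `set1`, its last byte is repeated (GNU tr padding semantics);
/// when `set2` is empty, the excess bytes of `set1` map to themselves.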
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
116
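/// Builds a 256-bit membership bitset (one bit per possible byte value) for the
/// given characters; `is_member` below tests a byte against it.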
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
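// A minimal sketch (not part of the original file) of how the table and bitset
// builders behave; the assertions below use only the helpers defined above.
#[cfg(test)]
mod table_building_sketch {
    use super::*;

    #[test]
    fn short_set2_repeats_last_byte() {
        // "abc" -> "x": with set2 shorter than set1, every source byte maps to 'x'.
        let table = build_translate_table(b"abc", b"x");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'c' as usize], b'x');
        // Bytes outside set1 stay identity-mapped.
        assert_eq!(table[b'z' as usize], b'z');
    }

    #[test]
    fn member_set_bitset() {
        let set = build_member_set(b"abc");
        assert!(is_member(&set, b'a'));
        assert!(!is_member(&set, b'z'));
    }
}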
131
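// Cached result of runtime CPU feature detection: 0 = not yet probed,
// 1 = baseline, 2 = SSSE3, 3 = AVX2.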
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}
154
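/// Counts table entries that differ from the identity mapping; the dispatcher
/// below uses this to pick the sparse AVX2 path when only a few bytes change.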
#[cfg(target_arch = "x86_64")]
#[inline]
fn count_non_identity(table: &[u8; 256]) -> usize {
    table
        .iter()
        .enumerate()
        .filter(|&(i, &v)| v != i as u8)
        .count()
}
165
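/// Translates `data` in place through `table`, dispatching on the detected SIMD
/// level: an AVX2 kernel that skips stores on unchanged blocks when at most 16
/// entries differ from identity, then the full AVX2 nibble-LUT kernel, then
/// SSSE3, then the unrolled scalar fallback.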
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            let non_id = count_non_identity(table);
            if non_id > 0 && non_id <= 16 {
                unsafe { translate_inplace_avx2_sparse(data, table) };
                return;
            }
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}
196
197#[cfg(target_arch = "x86_64")]
203#[target_feature(enable = "avx2")]
204unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
205 use std::arch::x86_64::*;
206
207 unsafe {
208 let len = data.len();
209 let ptr = data.as_mut_ptr();
210
211 let mut lut = [_mm256_setzero_si256(); 16];
213 for h in 0u8..16 {
214 let base = (h as usize) * 16;
215 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
216 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
217 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
218 }
219
220 let lo_mask = _mm256_set1_epi8(0x0F);
221
222 let mut i = 0;
223 while i + 32 <= len {
224 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
225 let lo_nibble = _mm256_and_si256(input, lo_mask);
226 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
227
228 let mut result = _mm256_setzero_si256();
229 macro_rules! do_nibble {
230 ($h:expr) => {
231 let h_val = _mm256_set1_epi8($h as i8);
232 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
233 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
234 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
235 };
236 }
237 do_nibble!(0);
238 do_nibble!(1);
239 do_nibble!(2);
240 do_nibble!(3);
241 do_nibble!(4);
242 do_nibble!(5);
243 do_nibble!(6);
244 do_nibble!(7);
245 do_nibble!(8);
246 do_nibble!(9);
247 do_nibble!(10);
248 do_nibble!(11);
249 do_nibble!(12);
250 do_nibble!(13);
251 do_nibble!(14);
252 do_nibble!(15);
253
254 let diff = _mm256_xor_si256(input, result);
256 if _mm256_testz_si256(diff, diff) == 0 {
257 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
258 }
259 i += 32;
260 }
261
262 while i < len {
264 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
265 i += 1;
266 }
267 }
268}
269
270#[cfg(not(target_arch = "aarch64"))]
272#[inline(always)]
273fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
274 let len = data.len();
275 let ptr = data.as_mut_ptr();
276 let mut i = 0;
277 unsafe {
278 while i + 8 <= len {
279 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
280 *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
281 *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
282 *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
283 *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
284 *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
285 *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
286 *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
287 i += 8;
288 }
289 while i < len {
290 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
291 i += 1;
292 }
293 }
294}
295
296#[cfg(target_arch = "aarch64")]
299#[inline(always)]
300fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
301 unsafe { translate_inplace_neon_table(data, table) };
302}
303
304#[cfg(target_arch = "aarch64")]
305#[target_feature(enable = "neon")]
306unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
307 use std::arch::aarch64::*;
308
309 unsafe {
310 let len = data.len();
311 let ptr = data.as_mut_ptr();
312
313 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
315 for h in 0u8..16 {
316 let base = (h as usize) * 16;
317 lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
318 }
319
320 let lo_mask = vdupq_n_u8(0x0F);
321 let mut i = 0;
322
323 while i + 16 <= len {
324 let input = vld1q_u8(ptr.add(i));
325 let lo_nibble = vandq_u8(input, lo_mask);
326 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
327
328 let mut result = vdupq_n_u8(0);
329 macro_rules! do_nibble {
330 ($h:expr) => {
331 let h_val = vdupq_n_u8($h);
332 let mask = vceqq_u8(hi_nibble, h_val);
333 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
334 result = vorrq_u8(result, vandq_u8(mask, looked_up));
335 };
336 }
337 do_nibble!(0);
338 do_nibble!(1);
339 do_nibble!(2);
340 do_nibble!(3);
341 do_nibble!(4);
342 do_nibble!(5);
343 do_nibble!(6);
344 do_nibble!(7);
345 do_nibble!(8);
346 do_nibble!(9);
347 do_nibble!(10);
348 do_nibble!(11);
349 do_nibble!(12);
350 do_nibble!(13);
351 do_nibble!(14);
352 do_nibble!(15);
353
354 vst1q_u8(ptr.add(i), result);
355 i += 16;
356 }
357
358 while i < len {
360 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
361 i += 1;
362 }
363 }
364}
365
366#[cfg(target_arch = "x86_64")]
386#[target_feature(enable = "avx2")]
387unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
388 use std::arch::x86_64::*;
389
390 unsafe {
391 let len = data.len();
392 let ptr = data.as_mut_ptr();
393
394 let mut lut = [_mm256_setzero_si256(); 16];
398 for h in 0u8..16 {
399 let base = (h as usize) * 16;
400 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
401 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
403 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
404 }
405
406 let lo_mask = _mm256_set1_epi8(0x0F);
407
408 let mut i = 0;
409
410 while i + 64 <= len {
414 let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
415 let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
416
417 let lo0 = _mm256_and_si256(input0, lo_mask);
418 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
419 let lo1 = _mm256_and_si256(input1, lo_mask);
420 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
421
422 let mut r0 = _mm256_setzero_si256();
423 let mut r1 = _mm256_setzero_si256();
424
425 macro_rules! do_nibble2 {
426 ($h:expr) => {
427 let h_val = _mm256_set1_epi8($h as i8);
428 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
429 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
430 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
431 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
432 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
433 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
434 };
435 }
436 do_nibble2!(0);
437 do_nibble2!(1);
438 do_nibble2!(2);
439 do_nibble2!(3);
440 do_nibble2!(4);
441 do_nibble2!(5);
442 do_nibble2!(6);
443 do_nibble2!(7);
444 do_nibble2!(8);
445 do_nibble2!(9);
446 do_nibble2!(10);
447 do_nibble2!(11);
448 do_nibble2!(12);
449 do_nibble2!(13);
450 do_nibble2!(14);
451 do_nibble2!(15);
452
453 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
454 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
455 i += 64;
456 }
457
458 if i + 32 <= len {
460 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
461 let lo_nibble = _mm256_and_si256(input, lo_mask);
462 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
463
464 let mut result = _mm256_setzero_si256();
465
466 macro_rules! do_nibble {
467 ($h:expr) => {
468 let h_val = _mm256_set1_epi8($h as i8);
469 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
470 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
471 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
472 };
473 }
474 do_nibble!(0);
475 do_nibble!(1);
476 do_nibble!(2);
477 do_nibble!(3);
478 do_nibble!(4);
479 do_nibble!(5);
480 do_nibble!(6);
481 do_nibble!(7);
482 do_nibble!(8);
483 do_nibble!(9);
484 do_nibble!(10);
485 do_nibble!(11);
486 do_nibble!(12);
487 do_nibble!(13);
488 do_nibble!(14);
489 do_nibble!(15);
490
491 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
492 i += 32;
493 }
494
495 if i + 16 <= len {
497 let lo_mask128 = _mm_set1_epi8(0x0F);
498
499 let mut lut128 = [_mm_setzero_si128(); 16];
500 for h in 0u8..16 {
501 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
502 }
503
504 let input = _mm_loadu_si128(ptr.add(i) as *const _);
505 let lo_nib = _mm_and_si128(input, lo_mask128);
506 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
507
508 let mut res = _mm_setzero_si128();
509 macro_rules! do_nibble128 {
510 ($h:expr) => {
511 let h_val = _mm_set1_epi8($h as i8);
512 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
513 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
514 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
515 };
516 }
517 do_nibble128!(0);
518 do_nibble128!(1);
519 do_nibble128!(2);
520 do_nibble128!(3);
521 do_nibble128!(4);
522 do_nibble128!(5);
523 do_nibble128!(6);
524 do_nibble128!(7);
525 do_nibble128!(8);
526 do_nibble128!(9);
527 do_nibble128!(10);
528 do_nibble128!(11);
529 do_nibble128!(12);
530 do_nibble128!(13);
531 do_nibble128!(14);
532 do_nibble128!(15);
533
534 _mm_storeu_si128(ptr.add(i) as *mut _, res);
535 i += 16;
536 }
537
538 while i < len {
540 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
541 i += 1;
542 }
543 }
544}
545
546#[cfg(target_arch = "x86_64")]
547#[target_feature(enable = "ssse3")]
548unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
549 use std::arch::x86_64::*;
550
551 unsafe {
552 let len = data.len();
553 let ptr = data.as_mut_ptr();
554
555 let mut lut = [_mm_setzero_si128(); 16];
557 for h in 0u8..16 {
558 let base = (h as usize) * 16;
559 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
560 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
561 }
562
563 let lo_mask = _mm_set1_epi8(0x0F);
564
565 let mut i = 0;
566 while i + 16 <= len {
567 let input = _mm_loadu_si128(ptr.add(i) as *const _);
568 let lo_nibble = _mm_and_si128(input, lo_mask);
569 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
570
571 let mut result = _mm_setzero_si128();
572
573 macro_rules! do_nibble {
574 ($h:expr) => {
575 let h_val = _mm_set1_epi8($h as i8);
576 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
577 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
578 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
579 };
580 }
581 do_nibble!(0);
582 do_nibble!(1);
583 do_nibble!(2);
584 do_nibble!(3);
585 do_nibble!(4);
586 do_nibble!(5);
587 do_nibble!(6);
588 do_nibble!(7);
589 do_nibble!(8);
590 do_nibble!(9);
591 do_nibble!(10);
592 do_nibble!(11);
593 do_nibble!(12);
594 do_nibble!(13);
595 do_nibble!(14);
596 do_nibble!(15);
597
598 _mm_storeu_si128(ptr.add(i) as *mut _, result);
599 i += 16;
600 }
601
602 while i < len {
604 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
605 i += 1;
606 }
607 }
608}
609
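/// Out-of-place variant of `translate_inplace`: translates `src` into `dst`
/// through `table`. On AVX2, a non-temporal-store kernel is used when `dst` is
/// 32-byte aligned so large outputs bypass the cache.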
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            if dst.as_ptr() as usize & 31 == 0 {
                unsafe { translate_to_avx2_table_nt(src, dst, table) };
            } else {
                unsafe { translate_to_avx2_table(src, dst, table) };
            }
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}
634
635#[cfg(not(target_arch = "aarch64"))]
637#[inline(always)]
638fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
639 unsafe {
640 let sp = src.as_ptr();
641 let dp = dst.as_mut_ptr();
642 let len = src.len();
643 let mut i = 0;
644 while i + 8 <= len {
645 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
646 *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
647 *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
648 *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
649 *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
650 *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
651 *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
652 *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
653 i += 8;
654 }
655 while i < len {
656 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
657 i += 1;
658 }
659 }
660}
661
662#[cfg(target_arch = "aarch64")]
664#[inline(always)]
665fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
666 unsafe { translate_to_neon_table(src, dst, table) };
667}
668
669#[cfg(target_arch = "aarch64")]
670#[target_feature(enable = "neon")]
671unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
672 use std::arch::aarch64::*;
673
674 unsafe {
675 let len = src.len();
676 let sp = src.as_ptr();
677 let dp = dst.as_mut_ptr();
678
679 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
680 for h in 0u8..16 {
681 lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
682 }
683
684 let lo_mask = vdupq_n_u8(0x0F);
685 let mut i = 0;
686
687 while i + 16 <= len {
688 let input = vld1q_u8(sp.add(i));
689 let lo_nibble = vandq_u8(input, lo_mask);
690 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
691
692 let mut result = vdupq_n_u8(0);
693 macro_rules! do_nibble {
694 ($h:expr) => {
695 let h_val = vdupq_n_u8($h);
696 let mask = vceqq_u8(hi_nibble, h_val);
697 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
698 result = vorrq_u8(result, vandq_u8(mask, looked_up));
699 };
700 }
701 do_nibble!(0);
702 do_nibble!(1);
703 do_nibble!(2);
704 do_nibble!(3);
705 do_nibble!(4);
706 do_nibble!(5);
707 do_nibble!(6);
708 do_nibble!(7);
709 do_nibble!(8);
710 do_nibble!(9);
711 do_nibble!(10);
712 do_nibble!(11);
713 do_nibble!(12);
714 do_nibble!(13);
715 do_nibble!(14);
716 do_nibble!(15);
717
718 vst1q_u8(dp.add(i), result);
719 i += 16;
720 }
721
722 while i < len {
723 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
724 i += 1;
725 }
726 }
727}
728
729#[cfg(target_arch = "x86_64")]
730#[target_feature(enable = "avx2")]
731unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
732 use std::arch::x86_64::*;
733
734 unsafe {
735 let len = src.len();
736 let sp = src.as_ptr();
737 let dp = dst.as_mut_ptr();
738
739 let mut lut = [_mm256_setzero_si256(); 16];
741 for h in 0u8..16 {
742 let base = (h as usize) * 16;
743 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
744 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
745 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
746 }
747
748 let lo_mask = _mm256_set1_epi8(0x0F);
749
750 let mut i = 0;
751
752 while i + 64 <= len {
754 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
755 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
756
757 let lo0 = _mm256_and_si256(input0, lo_mask);
758 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
759 let lo1 = _mm256_and_si256(input1, lo_mask);
760 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
761
762 let mut r0 = _mm256_setzero_si256();
763 let mut r1 = _mm256_setzero_si256();
764
765 macro_rules! do_nibble2 {
766 ($h:expr) => {
767 let h_val = _mm256_set1_epi8($h as i8);
768 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
769 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
770 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
771 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
772 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
773 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
774 };
775 }
776 do_nibble2!(0);
777 do_nibble2!(1);
778 do_nibble2!(2);
779 do_nibble2!(3);
780 do_nibble2!(4);
781 do_nibble2!(5);
782 do_nibble2!(6);
783 do_nibble2!(7);
784 do_nibble2!(8);
785 do_nibble2!(9);
786 do_nibble2!(10);
787 do_nibble2!(11);
788 do_nibble2!(12);
789 do_nibble2!(13);
790 do_nibble2!(14);
791 do_nibble2!(15);
792
793 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
794 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
795 i += 64;
796 }
797
798 if i + 32 <= len {
800 let input = _mm256_loadu_si256(sp.add(i) as *const _);
801 let lo_nibble = _mm256_and_si256(input, lo_mask);
802 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
803
804 let mut result = _mm256_setzero_si256();
805
806 macro_rules! do_nibble {
807 ($h:expr) => {
808 let h_val = _mm256_set1_epi8($h as i8);
809 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
810 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
811 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
812 };
813 }
814 do_nibble!(0);
815 do_nibble!(1);
816 do_nibble!(2);
817 do_nibble!(3);
818 do_nibble!(4);
819 do_nibble!(5);
820 do_nibble!(6);
821 do_nibble!(7);
822 do_nibble!(8);
823 do_nibble!(9);
824 do_nibble!(10);
825 do_nibble!(11);
826 do_nibble!(12);
827 do_nibble!(13);
828 do_nibble!(14);
829 do_nibble!(15);
830
831 _mm256_storeu_si256(dp.add(i) as *mut _, result);
832 i += 32;
833 }
834
835 if i + 16 <= len {
837 let lo_mask128 = _mm_set1_epi8(0x0F);
838 let mut lut128 = [_mm_setzero_si128(); 16];
839 for h in 0u8..16 {
840 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
841 }
842
843 let input = _mm_loadu_si128(sp.add(i) as *const _);
844 let lo_nib = _mm_and_si128(input, lo_mask128);
845 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
846
847 let mut res = _mm_setzero_si128();
848 macro_rules! do_nibble128 {
849 ($h:expr) => {
850 let h_val = _mm_set1_epi8($h as i8);
851 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
852 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
853 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
854 };
855 }
856 do_nibble128!(0);
857 do_nibble128!(1);
858 do_nibble128!(2);
859 do_nibble128!(3);
860 do_nibble128!(4);
861 do_nibble128!(5);
862 do_nibble128!(6);
863 do_nibble128!(7);
864 do_nibble128!(8);
865 do_nibble128!(9);
866 do_nibble128!(10);
867 do_nibble128!(11);
868 do_nibble128!(12);
869 do_nibble128!(13);
870 do_nibble128!(14);
871 do_nibble128!(15);
872
873 _mm_storeu_si128(dp.add(i) as *mut _, res);
874 i += 16;
875 }
876
877 while i < len {
879 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
880 i += 1;
881 }
882 }
883}
884
885#[cfg(target_arch = "x86_64")]
888#[target_feature(enable = "avx2")]
889unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
890 use std::arch::x86_64::*;
891
892 unsafe {
893 let len = src.len();
894 let sp = src.as_ptr();
895 let dp = dst.as_mut_ptr();
896
897 let mut lut = [_mm256_setzero_si256(); 16];
899 for h in 0u8..16 {
900 let base = (h as usize) * 16;
901 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
902 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
903 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
904 }
905
906 let lo_mask = _mm256_set1_epi8(0x0F);
907 let mut i = 0;
908
909 while i + 64 <= len {
911 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
912 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
913
914 let lo0 = _mm256_and_si256(input0, lo_mask);
915 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
916 let lo1 = _mm256_and_si256(input1, lo_mask);
917 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
918
919 let mut r0 = _mm256_setzero_si256();
920 let mut r1 = _mm256_setzero_si256();
921
922 macro_rules! do_nibble2 {
923 ($h:expr) => {
924 let h_val = _mm256_set1_epi8($h as i8);
925 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
926 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
927 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
928 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
929 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
930 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
931 };
932 }
933 do_nibble2!(0);
934 do_nibble2!(1);
935 do_nibble2!(2);
936 do_nibble2!(3);
937 do_nibble2!(4);
938 do_nibble2!(5);
939 do_nibble2!(6);
940 do_nibble2!(7);
941 do_nibble2!(8);
942 do_nibble2!(9);
943 do_nibble2!(10);
944 do_nibble2!(11);
945 do_nibble2!(12);
946 do_nibble2!(13);
947 do_nibble2!(14);
948 do_nibble2!(15);
949
950 _mm256_stream_si256(dp.add(i) as *mut _, r0);
951 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
952 i += 64;
953 }
954
955 if i + 32 <= len {
957 let input = _mm256_loadu_si256(sp.add(i) as *const _);
958 let lo_nibble = _mm256_and_si256(input, lo_mask);
959 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
960
961 let mut result = _mm256_setzero_si256();
962 macro_rules! do_nibble {
963 ($h:expr) => {
964 let h_val = _mm256_set1_epi8($h as i8);
965 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
966 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
967 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
968 };
969 }
970 do_nibble!(0);
971 do_nibble!(1);
972 do_nibble!(2);
973 do_nibble!(3);
974 do_nibble!(4);
975 do_nibble!(5);
976 do_nibble!(6);
977 do_nibble!(7);
978 do_nibble!(8);
979 do_nibble!(9);
980 do_nibble!(10);
981 do_nibble!(11);
982 do_nibble!(12);
983 do_nibble!(13);
984 do_nibble!(14);
985 do_nibble!(15);
986
987 _mm256_stream_si256(dp.add(i) as *mut _, result);
988 i += 32;
989 }
990
991 if i + 16 <= len {
993 let lo_mask128 = _mm_set1_epi8(0x0F);
994 let mut lut128 = [_mm_setzero_si128(); 16];
995 for h in 0u8..16 {
996 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
997 }
998
999 let input = _mm_loadu_si128(sp.add(i) as *const _);
1000 let lo_nib = _mm_and_si128(input, lo_mask128);
1001 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1002
1003 let mut res = _mm_setzero_si128();
1004 macro_rules! do_nibble128 {
1005 ($h:expr) => {
1006 let h_val = _mm_set1_epi8($h as i8);
1007 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1008 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1009 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1010 };
1011 }
1012 do_nibble128!(0);
1013 do_nibble128!(1);
1014 do_nibble128!(2);
1015 do_nibble128!(3);
1016 do_nibble128!(4);
1017 do_nibble128!(5);
1018 do_nibble128!(6);
1019 do_nibble128!(7);
1020 do_nibble128!(8);
1021 do_nibble128!(9);
1022 do_nibble128!(10);
1023 do_nibble128!(11);
1024 do_nibble128!(12);
1025 do_nibble128!(13);
1026 do_nibble128!(14);
1027 do_nibble128!(15);
1028
1029 _mm_storeu_si128(dp.add(i) as *mut _, res);
1030 i += 16;
1031 }
1032
1033 while i < len {
1035 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1036 i += 1;
1037 }
1038
1039 _mm_sfence();
1041 }
1042}
1043
1044#[cfg(target_arch = "x86_64")]
1045#[target_feature(enable = "ssse3")]
1046unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1047 use std::arch::x86_64::*;
1048
1049 unsafe {
1050 let len = src.len();
1051 let sp = src.as_ptr();
1052 let dp = dst.as_mut_ptr();
1053
1054 let mut lut = [_mm_setzero_si128(); 16];
1055 for h in 0u8..16 {
1056 let base = (h as usize) * 16;
1057 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1058 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1059 }
1060
1061 let lo_mask = _mm_set1_epi8(0x0F);
1062
1063 let mut i = 0;
1064 while i + 16 <= len {
1065 let input = _mm_loadu_si128(sp.add(i) as *const _);
1066 let lo_nibble = _mm_and_si128(input, lo_mask);
1067 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1068
1069 let mut result = _mm_setzero_si128();
1070
1071 macro_rules! do_nibble {
1072 ($h:expr) => {
1073 let h_val = _mm_set1_epi8($h as i8);
1074 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1075 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1076 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1077 };
1078 }
1079 do_nibble!(0);
1080 do_nibble!(1);
1081 do_nibble!(2);
1082 do_nibble!(3);
1083 do_nibble!(4);
1084 do_nibble!(5);
1085 do_nibble!(6);
1086 do_nibble!(7);
1087 do_nibble!(8);
1088 do_nibble!(9);
1089 do_nibble!(10);
1090 do_nibble!(11);
1091 do_nibble!(12);
1092 do_nibble!(13);
1093 do_nibble!(14);
1094 do_nibble!(15);
1095
1096 _mm_storeu_si128(dp.add(i) as *mut _, result);
1097 i += 16;
1098 }
1099
1100 while i < len {
1102 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1103 i += 1;
1104 }
1105 }
1106}
1107
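/// Detects tables that shift one contiguous byte range by a constant amount
/// (e.g. 'A'..='Z' -> 'a'..='z' is the range (b'A', b'Z') with offset +32).
/// Returns `None` if the non-identity entries are not a single uniform range.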
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
1142
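/// Detects tables that map one contiguous byte range to a single replacement
/// byte (e.g. '0'..='9' -> '#'). Returns `None` for any other shape.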
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}
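// A minimal sketch (not part of the original file) of what the two detectors
// above report for tables built with `build_translate_table`.
#[cfg(test)]
mod range_detection_sketch {
    use super::*;

    #[test]
    fn uppercase_to_lowercase_is_a_range_offset() {
        let upper: Vec<u8> = (b'A'..=b'Z').collect();
        let lower: Vec<u8> = (b'a'..=b'z').collect();
        let table = build_translate_table(&upper, &lower);
        assert_eq!(detect_range_offset(&table), Some((b'A', b'Z', 32)));
        assert_eq!(detect_range_to_constant(&table), None);
    }

    #[test]
    fn digits_to_hash_is_a_range_to_constant() {
        let table = build_translate_table(b"0123456789", b"#");
        assert_eq!(detect_range_to_constant(&table), Some((b'0', b'9', b'#')));
        assert_eq!(detect_range_offset(&table), None);
    }
}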
1173
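/// In-place "replace every byte in [lo, hi] with `replacement`" specialisation,
/// dispatched to AVX2 or SSE2. The kernels bias the input so that one signed
/// compare performs the unsigned range test.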
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}
1186
1187#[cfg(target_arch = "x86_64")]
1188#[target_feature(enable = "avx2")]
1189unsafe fn translate_range_to_constant_avx2_inplace(
1190 data: &mut [u8],
1191 lo: u8,
1192 hi: u8,
1193 replacement: u8,
1194) {
1195 use std::arch::x86_64::*;
1196
1197 unsafe {
1198 let range = hi - lo;
1199 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1200 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1201 let repl_v = _mm256_set1_epi8(replacement as i8);
1202 let zero = _mm256_setzero_si256();
1203
1204 let len = data.len();
1205 let ptr = data.as_mut_ptr();
1206 let mut i = 0;
1207
1208 while i + 64 <= len {
1210 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1211 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1212 let bi0 = _mm256_add_epi8(in0, bias_v);
1213 let bi1 = _mm256_add_epi8(in1, bias_v);
1214 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1215 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1216 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1217 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1218 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1219 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1220 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1221 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1222 i += 64;
1223 }
1224
1225 if i + 32 <= len {
1227 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1228 let biased = _mm256_add_epi8(input, bias_v);
1229 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1230 let in_range = _mm256_cmpeq_epi8(gt, zero);
1231 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1232 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1233 i += 32;
1234 }
1235
1236 if i + 16 <= len {
1237 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1238 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1239 let repl_v128 = _mm_set1_epi8(replacement as i8);
1240 let zero128 = _mm_setzero_si128();
1241
1242 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1243 let biased = _mm_add_epi8(input, bias_v128);
1244 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1245 let in_range = _mm_cmpeq_epi8(gt, zero128);
1246 let result = _mm_blendv_epi8(input, repl_v128, in_range);
1247 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1248 i += 16;
1249 }
1250
1251 while i < len {
1252 let b = *ptr.add(i);
1253 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1254 i += 1;
1255 }
1256 }
1257}
1258
1259#[cfg(target_arch = "x86_64")]
1260#[target_feature(enable = "sse2")]
1261unsafe fn translate_range_to_constant_sse2_inplace(
1262 data: &mut [u8],
1263 lo: u8,
1264 hi: u8,
1265 replacement: u8,
1266) {
1267 use std::arch::x86_64::*;
1268
1269 unsafe {
1270 let range = hi - lo;
1271 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1272 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1273 let repl_v = _mm_set1_epi8(replacement as i8);
1274 let zero = _mm_setzero_si128();
1275
1276 let len = data.len();
1277 let ptr = data.as_mut_ptr();
1278 let mut i = 0;
1279
1280 while i + 16 <= len {
1281 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1282 let biased = _mm_add_epi8(input, bias_v);
1283 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1284 let in_range = _mm_cmpeq_epi8(gt, zero);
1286 let result = _mm_or_si128(
1288 _mm_and_si128(in_range, repl_v),
1289 _mm_andnot_si128(in_range, input),
1290 );
1291 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1292 i += 16;
1293 }
1294
1295 while i < len {
1296 let b = *ptr.add(i);
1297 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1298 i += 1;
1299 }
1300 }
1301}
1302
1303#[cfg(target_arch = "aarch64")]
1304fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1305 unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1306}
1307
1308#[cfg(target_arch = "aarch64")]
1309#[target_feature(enable = "neon")]
1310unsafe fn translate_range_to_constant_neon_inplace(
1311 data: &mut [u8],
1312 lo: u8,
1313 hi: u8,
1314 replacement: u8,
1315) {
1316 use std::arch::aarch64::*;
1317
1318 unsafe {
1319 let len = data.len();
1320 let ptr = data.as_mut_ptr();
1321 let lo_v = vdupq_n_u8(lo);
1322 let hi_v = vdupq_n_u8(hi);
1323 let repl_v = vdupq_n_u8(replacement);
1324 let mut i = 0;
1325
1326 while i + 32 <= len {
1327 let in0 = vld1q_u8(ptr.add(i));
1328 let in1 = vld1q_u8(ptr.add(i + 16));
1329 let ge0 = vcgeq_u8(in0, lo_v);
1330 let le0 = vcleq_u8(in0, hi_v);
1331 let mask0 = vandq_u8(ge0, le0);
1332 let ge1 = vcgeq_u8(in1, lo_v);
1333 let le1 = vcleq_u8(in1, hi_v);
1334 let mask1 = vandq_u8(ge1, le1);
1335 vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1337 vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1338 i += 32;
1339 }
1340
1341 if i + 16 <= len {
1342 let input = vld1q_u8(ptr.add(i));
1343 let ge = vcgeq_u8(input, lo_v);
1344 let le = vcleq_u8(input, hi_v);
1345 let mask = vandq_u8(ge, le);
1346 vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1347 i += 16;
1348 }
1349
1350 while i < len {
1351 let b = *ptr.add(i);
1352 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1353 i += 1;
1354 }
1355 }
1356}
1357
1358#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1359fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1360 for b in data.iter_mut() {
1361 if *b >= lo && *b <= hi {
1362 *b = replacement;
1363 }
1364 }
1365}
1366
1367#[cfg(target_arch = "x86_64")]
1370fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1371 if get_simd_level() >= 3 {
1372 unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1373 } else {
1374 unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1375 }
1376}
1377
1378#[cfg(target_arch = "aarch64")]
1379fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1380 unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1381}
1382
1383#[cfg(target_arch = "aarch64")]
1384#[target_feature(enable = "neon")]
1385unsafe fn translate_range_to_constant_neon(
1386 src: &[u8],
1387 dst: &mut [u8],
1388 lo: u8,
1389 hi: u8,
1390 replacement: u8,
1391) {
1392 use std::arch::aarch64::*;
1393
1394 unsafe {
1395 let len = src.len();
1396 let sp = src.as_ptr();
1397 let dp = dst.as_mut_ptr();
1398 let lo_v = vdupq_n_u8(lo);
1399 let hi_v = vdupq_n_u8(hi);
1400 let repl_v = vdupq_n_u8(replacement);
1401 let mut i = 0;
1402
1403 while i + 32 <= len {
1404 let in0 = vld1q_u8(sp.add(i));
1405 let in1 = vld1q_u8(sp.add(i + 16));
1406 let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1407 let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1408 vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1409 vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1410 i += 32;
1411 }
1412
1413 if i + 16 <= len {
1414 let input = vld1q_u8(sp.add(i));
1415 let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1416 vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1417 i += 16;
1418 }
1419
1420 while i < len {
1421 let b = *sp.add(i);
1422 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1423 i += 1;
1424 }
1425 }
1426}
1427
1428#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1429fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1430 for (i, &b) in src.iter().enumerate() {
1431 unsafe {
1432 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1433 }
1434 }
1435}
1436
1437#[cfg(target_arch = "x86_64")]
1438#[target_feature(enable = "avx2")]
1439unsafe fn translate_range_to_constant_avx2(
1440 src: &[u8],
1441 dst: &mut [u8],
1442 lo: u8,
1443 hi: u8,
1444 replacement: u8,
1445) {
1446 use std::arch::x86_64::*;
1447 unsafe {
1448 let range = hi - lo;
1449 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1450 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1451 let repl_v = _mm256_set1_epi8(replacement as i8);
1452 let zero = _mm256_setzero_si256();
1453 let len = src.len();
1454 let sp = src.as_ptr();
1455 let dp = dst.as_mut_ptr();
1456 let mut i = 0;
1457 while i + 64 <= len {
1458 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1459 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1460 let bi0 = _mm256_add_epi8(in0, bias_v);
1461 let bi1 = _mm256_add_epi8(in1, bias_v);
1462 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1463 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1464 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1465 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1466 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1467 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1468 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1469 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1470 i += 64;
1471 }
1472 if i + 32 <= len {
1473 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1474 let biased = _mm256_add_epi8(input, bias_v);
1475 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1476 let in_range = _mm256_cmpeq_epi8(gt, zero);
1477 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1478 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1479 i += 32;
1480 }
1481 while i < len {
1482 let b = *sp.add(i);
1483 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1484 i += 1;
1485 }
1486 }
1487}
1488
1489#[cfg(target_arch = "x86_64")]
1490#[target_feature(enable = "sse2")]
1491unsafe fn translate_range_to_constant_sse2(
1492 src: &[u8],
1493 dst: &mut [u8],
1494 lo: u8,
1495 hi: u8,
1496 replacement: u8,
1497) {
1498 use std::arch::x86_64::*;
1499 unsafe {
1500 let range = hi - lo;
1501 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1502 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1503 let repl_v = _mm_set1_epi8(replacement as i8);
1504 let zero = _mm_setzero_si128();
1505 let len = src.len();
1506 let sp = src.as_ptr();
1507 let dp = dst.as_mut_ptr();
1508 let mut i = 0;
1509 while i + 16 <= len {
1510 let input = _mm_loadu_si128(sp.add(i) as *const _);
1511 let biased = _mm_add_epi8(input, bias_v);
1512 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1513 let in_range = _mm_cmpeq_epi8(gt, zero);
1514 let result = _mm_or_si128(
1515 _mm_and_si128(in_range, repl_v),
1516 _mm_andnot_si128(in_range, input),
1517 );
1518 _mm_storeu_si128(dp.add(i) as *mut _, result);
1519 i += 16;
1520 }
1521 while i < len {
1522 let b = *sp.add(i);
1523 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1524 i += 1;
1525 }
1526 }
1527}
1528
1529#[cfg(target_arch = "x86_64")]
1536fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1537 if get_simd_level() >= 3 {
1538 if dst.as_ptr() as usize & 31 == 0 {
1540 unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1541 } else {
1542 unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1543 }
1544 } else {
1545 unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1546 }
1547}
1548
1549#[cfg(target_arch = "x86_64")]
1550#[target_feature(enable = "avx2")]
1551unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1552 use std::arch::x86_64::*;
1553
1554 unsafe {
1555 let range = hi - lo;
1556 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1561 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1562 let offset_v = _mm256_set1_epi8(offset);
1563 let zero = _mm256_setzero_si256();
1564
1565 let len = src.len();
1566 let sp = src.as_ptr();
1567 let dp = dst.as_mut_ptr();
1568 let mut i = 0;
1569
1570 while i + 64 <= len {
1573 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1574 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1575 let bi0 = _mm256_add_epi8(in0, bias_v);
1576 let bi1 = _mm256_add_epi8(in1, bias_v);
1577 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1578 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1579 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1580 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1581 let om0 = _mm256_and_si256(m0, offset_v);
1582 let om1 = _mm256_and_si256(m1, offset_v);
1583 let r0 = _mm256_add_epi8(in0, om0);
1584 let r1 = _mm256_add_epi8(in1, om1);
1585 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1586 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1587 i += 64;
1588 }
1589
1590 if i + 32 <= len {
1592 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1593 let biased = _mm256_add_epi8(input, bias_v);
1594 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1595 let mask = _mm256_cmpeq_epi8(gt, zero);
1596 let offset_masked = _mm256_and_si256(mask, offset_v);
1597 let result = _mm256_add_epi8(input, offset_masked);
1598 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1599 i += 32;
1600 }
1601
1602 if i + 16 <= len {
1604 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1605 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1606 let offset_v128 = _mm_set1_epi8(offset);
1607 let zero128 = _mm_setzero_si128();
1608
1609 let input = _mm_loadu_si128(sp.add(i) as *const _);
1610 let biased = _mm_add_epi8(input, bias_v128);
1611 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1612 let mask = _mm_cmpeq_epi8(gt, zero128);
1613 let offset_masked = _mm_and_si128(mask, offset_v128);
1614 let result = _mm_add_epi8(input, offset_masked);
1615 _mm_storeu_si128(dp.add(i) as *mut _, result);
1616 i += 16;
1617 }
1618
1619 while i < len {
1621 let b = *sp.add(i);
1622 *dp.add(i) = if b >= lo && b <= hi {
1623 b.wrapping_add(offset as u8)
1624 } else {
1625 b
1626 };
1627 i += 1;
1628 }
1629 }
1630}
1631
1632#[cfg(target_arch = "x86_64")]
1638#[target_feature(enable = "avx2")]
1639unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1640 use std::arch::x86_64::*;
1641
1642 unsafe {
1643 let range = hi - lo;
1644 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1645 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1646 let offset_v = _mm256_set1_epi8(offset);
1647 let zero = _mm256_setzero_si256();
1648
1649 let len = src.len();
1650 let sp = src.as_ptr();
1651 let dp = dst.as_mut_ptr();
1652 let mut i = 0;
1653
1654 while i + 64 <= len {
1656 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1657 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1658 let bi0 = _mm256_add_epi8(in0, bias_v);
1659 let bi1 = _mm256_add_epi8(in1, bias_v);
1660 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1661 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1662 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1663 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1664 let om0 = _mm256_and_si256(m0, offset_v);
1665 let om1 = _mm256_and_si256(m1, offset_v);
1666 let r0 = _mm256_add_epi8(in0, om0);
1667 let r1 = _mm256_add_epi8(in1, om1);
1668 _mm256_stream_si256(dp.add(i) as *mut _, r0);
1669 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1670 i += 64;
1671 }
1672
1673 if i + 32 <= len {
1675 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1676 let biased = _mm256_add_epi8(input, bias_v);
1677 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1678 let mask = _mm256_cmpeq_epi8(gt, zero);
1679 let offset_masked = _mm256_and_si256(mask, offset_v);
1680 let result = _mm256_add_epi8(input, offset_masked);
1681 _mm256_stream_si256(dp.add(i) as *mut _, result);
1682 i += 32;
1683 }
1684
1685 if i + 16 <= len {
1687 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1688 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1689 let offset_v128 = _mm_set1_epi8(offset);
1690 let zero128 = _mm_setzero_si128();
1691
1692 let input = _mm_loadu_si128(sp.add(i) as *const _);
1693 let biased = _mm_add_epi8(input, bias_v128);
1694 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1695 let mask = _mm_cmpeq_epi8(gt, zero128);
1696 let offset_masked = _mm_and_si128(mask, offset_v128);
1697 let result = _mm_add_epi8(input, offset_masked);
1698 _mm_storeu_si128(dp.add(i) as *mut _, result);
1699 i += 16;
1700 }
1701
1702 while i < len {
1704 let b = *sp.add(i);
1705 *dp.add(i) = if b >= lo && b <= hi {
1706 b.wrapping_add(offset as u8)
1707 } else {
1708 b
1709 };
1710 i += 1;
1711 }
1712
1713 _mm_sfence();
1715 }
1716}
1717
1718#[cfg(target_arch = "x86_64")]
1719#[target_feature(enable = "sse2")]
1720unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1721 use std::arch::x86_64::*;
1722
1723 unsafe {
1724 let range = hi - lo;
1725 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1726 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1727 let offset_v = _mm_set1_epi8(offset);
1728 let zero = _mm_setzero_si128();
1729
1730 let len = src.len();
1731 let mut i = 0;
1732
1733 while i + 16 <= len {
1734 let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1735 let biased = _mm_add_epi8(input, bias_v);
1736 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1737 let mask = _mm_cmpeq_epi8(gt, zero);
1738 let offset_masked = _mm_and_si128(mask, offset_v);
1739 let result = _mm_add_epi8(input, offset_masked);
1740 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1741 i += 16;
1742 }
1743
1744 while i < len {
1745 let b = *src.get_unchecked(i);
1746 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1747 b.wrapping_add(offset as u8)
1748 } else {
1749 b
1750 };
1751 i += 1;
1752 }
1753 }
1754}
1755
1756#[cfg(target_arch = "aarch64")]
1759fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1760 unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1761}
1762
1763#[cfg(target_arch = "aarch64")]
1764#[target_feature(enable = "neon")]
1765unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1766 use std::arch::aarch64::*;
1767
1768 unsafe {
1769 let len = src.len();
1770 let sp = src.as_ptr();
1771 let dp = dst.as_mut_ptr();
1772 let lo_v = vdupq_n_u8(lo);
1773 let hi_v = vdupq_n_u8(hi);
1774 let offset_v = vdupq_n_s8(offset);
1775 let mut i = 0;
1776
1777 while i + 32 <= len {
1779 let in0 = vld1q_u8(sp.add(i));
1780 let in1 = vld1q_u8(sp.add(i + 16));
1781 let ge0 = vcgeq_u8(in0, lo_v);
1783 let le0 = vcleq_u8(in0, hi_v);
1784 let mask0 = vandq_u8(ge0, le0);
1785 let ge1 = vcgeq_u8(in1, lo_v);
1786 let le1 = vcleq_u8(in1, hi_v);
1787 let mask1 = vandq_u8(ge1, le1);
1788 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1790 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1791 let r0 = vaddq_u8(in0, off0);
1792 let r1 = vaddq_u8(in1, off1);
1793 vst1q_u8(dp.add(i), r0);
1794 vst1q_u8(dp.add(i + 16), r1);
1795 i += 32;
1796 }
1797
1798 if i + 16 <= len {
1799 let input = vld1q_u8(sp.add(i));
1800 let ge = vcgeq_u8(input, lo_v);
1801 let le = vcleq_u8(input, hi_v);
1802 let mask = vandq_u8(ge, le);
1803 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1804 vst1q_u8(dp.add(i), vaddq_u8(input, off));
1805 i += 16;
1806 }
1807
1808 while i < len {
1809 let b = *sp.add(i);
1810 *dp.add(i) = if b >= lo && b <= hi {
1811 b.wrapping_add(offset as u8)
1812 } else {
1813 b
1814 };
1815 i += 1;
1816 }
1817 }
1818}
1819
1820#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1822fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1823 let offset_u8 = offset as u8;
1824 let range = hi.wrapping_sub(lo);
1825 unsafe {
1826 let sp = src.as_ptr();
1827 let dp = dst.as_mut_ptr();
1828 let len = src.len();
1829 let mut i = 0;
1830 while i + 8 <= len {
1831 macro_rules! do_byte {
1832 ($off:expr) => {{
1833 let b = *sp.add(i + $off);
1834 let in_range = b.wrapping_sub(lo) <= range;
1835 *dp.add(i + $off) = if in_range {
1836 b.wrapping_add(offset_u8)
1837 } else {
1838 b
1839 };
1840 }};
1841 }
1842 do_byte!(0);
1843 do_byte!(1);
1844 do_byte!(2);
1845 do_byte!(3);
1846 do_byte!(4);
1847 do_byte!(5);
1848 do_byte!(6);
1849 do_byte!(7);
1850 i += 8;
1851 }
1852 while i < len {
1853 let b = *sp.add(i);
1854 let in_range = b.wrapping_sub(lo) <= range;
1855 *dp.add(i) = if in_range {
1856 b.wrapping_add(offset_u8)
1857 } else {
1858 b
1859 };
1860 i += 1;
1861 }
1862 }
1863}
1864
1865#[cfg(target_arch = "x86_64")]
1873fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1874 if get_simd_level() >= 3 {
1875 unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1876 } else {
1877 unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1878 }
1879}
1880
1881#[cfg(target_arch = "x86_64")]
1882#[target_feature(enable = "avx2")]
1883unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1884 use std::arch::x86_64::*;
1885
1886 unsafe {
1887 let range = hi - lo;
1888 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1889 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1890 let offset_v = _mm256_set1_epi8(offset);
1891 let zero = _mm256_setzero_si256();
1892
1893 let len = data.len();
1894 let ptr = data.as_mut_ptr();
1895 let mut i = 0;
1896
1897 while i + 64 <= len {
1899 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1900 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1901 let bi0 = _mm256_add_epi8(in0, bias_v);
1902 let bi1 = _mm256_add_epi8(in1, bias_v);
1903 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1904 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1905 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1906 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1907 let om0 = _mm256_and_si256(m0, offset_v);
1908 let om1 = _mm256_and_si256(m1, offset_v);
1909 let r0 = _mm256_add_epi8(in0, om0);
1910 let r1 = _mm256_add_epi8(in1, om1);
1911 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1912 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1913 i += 64;
1914 }
1915
1916 if i + 32 <= len {
1918 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1919 let biased = _mm256_add_epi8(input, bias_v);
1920 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1921 let mask = _mm256_cmpeq_epi8(gt, zero);
1922 let offset_masked = _mm256_and_si256(mask, offset_v);
1923 let result = _mm256_add_epi8(input, offset_masked);
1924 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1925 i += 32;
1926 }
1927
1928 if i + 16 <= len {
1929 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1930 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1931 let offset_v128 = _mm_set1_epi8(offset);
1932 let zero128 = _mm_setzero_si128();
1933
1934 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1935 let biased = _mm_add_epi8(input, bias_v128);
1936 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1937 let mask = _mm_cmpeq_epi8(gt, zero128);
1938 let offset_masked = _mm_and_si128(mask, offset_v128);
1939 let result = _mm_add_epi8(input, offset_masked);
1940 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1941 i += 16;
1942 }
1943
1944 while i < len {
1945 let b = *ptr.add(i);
1946 *ptr.add(i) = if b >= lo && b <= hi {
1947 b.wrapping_add(offset as u8)
1948 } else {
1949 b
1950 };
1951 i += 1;
1952 }
1953 }
1954}
1955
1956#[cfg(target_arch = "x86_64")]
1957#[target_feature(enable = "sse2")]
1958unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1959 use std::arch::x86_64::*;
1960
1961 unsafe {
1962 let range = hi - lo;
1963 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1964 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1965 let offset_v = _mm_set1_epi8(offset);
1966 let zero = _mm_setzero_si128();
1967
1968 let len = data.len();
1969 let ptr = data.as_mut_ptr();
1970 let mut i = 0;
1971
1972 while i + 16 <= len {
1973 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1974 let biased = _mm_add_epi8(input, bias_v);
1975 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1976 let mask = _mm_cmpeq_epi8(gt, zero);
1977 let offset_masked = _mm_and_si128(mask, offset_v);
1978 let result = _mm_add_epi8(input, offset_masked);
1979 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1980 i += 16;
1981 }
1982
1983 while i < len {
1984 let b = *ptr.add(i);
1985 *ptr.add(i) = if b >= lo && b <= hi {
1986 b.wrapping_add(offset as u8)
1987 } else {
1988 b
1989 };
1990 i += 1;
1991 }
1992 }
1993}
1994
1995#[cfg(target_arch = "aarch64")]
1996fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1997 unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
1998}
1999
2000#[cfg(target_arch = "aarch64")]
2001#[target_feature(enable = "neon")]
2002unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2003 use std::arch::aarch64::*;
2004
2005 unsafe {
2006 let len = data.len();
2007 let ptr = data.as_mut_ptr();
2008 let lo_v = vdupq_n_u8(lo);
2009 let hi_v = vdupq_n_u8(hi);
2010 let offset_v = vdupq_n_s8(offset);
2011 let mut i = 0;
2012
2013 while i + 32 <= len {
2014 let in0 = vld1q_u8(ptr.add(i));
2015 let in1 = vld1q_u8(ptr.add(i + 16));
2016 let ge0 = vcgeq_u8(in0, lo_v);
2017 let le0 = vcleq_u8(in0, hi_v);
2018 let mask0 = vandq_u8(ge0, le0);
2019 let ge1 = vcgeq_u8(in1, lo_v);
2020 let le1 = vcleq_u8(in1, hi_v);
2021 let mask1 = vandq_u8(ge1, le1);
2022 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2023 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2024 vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2025 vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2026 i += 32;
2027 }
2028
2029 if i + 16 <= len {
2030 let input = vld1q_u8(ptr.add(i));
2031 let ge = vcgeq_u8(input, lo_v);
2032 let le = vcleq_u8(input, hi_v);
2033 let mask = vandq_u8(ge, le);
2034 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2035 vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2036 i += 16;
2037 }
2038
2039 while i < len {
2040 let b = *ptr.add(i);
2041 if b >= lo && b <= hi {
2042 *ptr.add(i) = b.wrapping_add(offset as u8);
2043 }
2044 i += 1;
2045 }
2046 }
2047}
2048
2049#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2050fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2051 let offset_u8 = offset as u8;
2052 let range = hi.wrapping_sub(lo);
2053 for b in data.iter_mut() {
2054 if b.wrapping_sub(lo) <= range {
2055 *b = b.wrapping_add(offset_u8);
2056 }
2057 }
2058}
2059
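/// Detects whether the delete set is one contiguous byte range by comparing its
/// span against its length; this assumes `chars` contains no duplicate bytes.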
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    if (hi as usize - lo as usize + 1) == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}
2092
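/// Copies `src` into `dst`, dropping every byte in [lo, hi], and returns the
/// number of bytes written. `dst` must be at least `src.len()` bytes long.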
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if get_simd_level() >= 3 {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}
2104
2105#[cfg(target_arch = "x86_64")]
2106#[target_feature(enable = "avx2")]
2107unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2108 use std::arch::x86_64::*;
2109
2110 unsafe {
2111 let range = hi - lo;
2112 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2113 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2114 let zero = _mm256_setzero_si256();
2115
2116 let len = src.len();
2117 let sp = src.as_ptr();
2118 let dp = dst.as_mut_ptr();
2119 let mut ri = 0;
2120 let mut wp = 0;
2121
2122 while ri + 32 <= len {
2123 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2124 let biased = _mm256_add_epi8(input, bias_v);
2125 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2127 let in_range = _mm256_cmpeq_epi8(gt, zero);
2129 let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2131
2132 if keep_mask == 0xFFFFFFFF {
2133 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2135 wp += 32;
2136 } else if keep_mask != 0 {
2137 let m0 = keep_mask as u8;
2142 let m1 = (keep_mask >> 8) as u8;
2143 let m2 = (keep_mask >> 16) as u8;
2144 let m3 = (keep_mask >> 24) as u8;
2145
2146 if m0 == 0xFF {
2147 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2148 } else if m0 != 0 {
2149 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2150 }
2151 let c0 = m0.count_ones() as usize;
2152
2153 if m1 == 0xFF {
2154 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2155 } else if m1 != 0 {
2156 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2157 }
2158 let c1 = m1.count_ones() as usize;
2159
2160 if m2 == 0xFF {
2161 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2162 } else if m2 != 0 {
2163 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2164 }
2165 let c2 = m2.count_ones() as usize;
2166
2167 if m3 == 0xFF {
2168 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2169 } else if m3 != 0 {
2170 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2171 }
2172 let c3 = m3.count_ones() as usize;
2173 wp += c0 + c1 + c2 + c3;
2174 }
2175 ri += 32;
2177 }
2178
2179 if ri + 16 <= len {
2181 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2182 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2183 let zero128 = _mm_setzero_si128();
2184
2185 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2186 let biased = _mm_add_epi8(input, bias_v128);
2187 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2188 let in_range = _mm_cmpeq_epi8(gt, zero128);
2189 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2190
2191 if keep_mask == 0xFFFF {
2192 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2193 wp += 16;
2194 } else if keep_mask != 0 {
2195 let m0 = keep_mask as u8;
2196 let m1 = (keep_mask >> 8) as u8;
2197 if m0 == 0xFF {
2198 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2199 } else if m0 != 0 {
2200 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2201 }
2202 let c0 = m0.count_ones() as usize;
2203 if m1 == 0xFF {
2204 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2205 } else if m1 != 0 {
2206 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2207 }
2208 wp += c0 + m1.count_ones() as usize;
2209 }
2210 ri += 16;
2211 }
2212
2213 while ri < len {
2215 let b = *sp.add(ri);
2216 *dp.add(wp) = b;
2217 wp += (b < lo || b > hi) as usize;
2218 ri += 1;
2219 }
2220
2221 wp
2222 }
2223}
2224
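// Left-packing 8 bytes under a keep-mask: `COMPACT_LUT[mask]` holds, in order,
// the source indices of the set bits of `mask`, so copying those bytes to
// consecutive destination slots compacts the kept bytes to the front. Only the
// first `mask.count_ones()` output bytes are meaningful; callers advance the
// write pointer by exactly that amount. `compact_8bytes` is the scalar form,
// `compact_8bytes_simd` does the same with a single PSHUFB.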
2225#[cfg(target_arch = "x86_64")]
2233#[inline(always)]
2234unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2235 unsafe {
2236 let idx = COMPACT_LUT.get_unchecked(mask as usize);
2237 *dst = *src.add(*idx.get_unchecked(0) as usize);
2238 *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2239 *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2240 *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2241 *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2242 *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2243 *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2244 *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2245 }
2246}
2247
2248#[cfg(target_arch = "x86_64")]
2253#[target_feature(enable = "ssse3")]
2254#[inline]
2255unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2256 use std::arch::x86_64::*;
2257 unsafe {
2258 let src_v = _mm_loadl_epi64(src as *const _);
2259 let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2260 let out_v = _mm_shuffle_epi8(src_v, shuf);
2261 _mm_storel_epi64(dst as *mut _, out_v);
2262 }
2263}
2264
2265#[cfg(target_arch = "x86_64")]
2266#[target_feature(enable = "sse2")]
2267unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2268 use std::arch::x86_64::*;
2269
2270 unsafe {
2271 let range = hi - lo;
2272 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2273 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2274 let zero = _mm_setzero_si128();
2275
2276 let len = src.len();
2277 let sp = src.as_ptr();
2278 let dp = dst.as_mut_ptr();
2279 let mut ri = 0;
2280 let mut wp = 0;
2281
2282 while ri + 16 <= len {
2283 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2284 let biased = _mm_add_epi8(input, bias_v);
2285 let gt = _mm_cmpgt_epi8(biased, threshold_v);
2286 let in_range = _mm_cmpeq_epi8(gt, zero);
2287 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2288
2289 if keep_mask == 0xFFFF {
2290 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2292 wp += 16;
2293 } else if keep_mask != 0 {
2294 let m0 = keep_mask as u8;
2295 let m1 = (keep_mask >> 8) as u8;
2296 if m0 == 0xFF {
2297 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2298 } else if m0 != 0 {
2299 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2300 }
2301 let c0 = m0.count_ones() as usize;
2302 if m1 == 0xFF {
2303 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2304 } else if m1 != 0 {
2305 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2306 }
2307 wp += c0 + m1.count_ones() as usize;
2308 }
2309 ri += 16;
2310 }
2311
2312 while ri < len {
2314 let b = *sp.add(ri);
2315 *dp.add(wp) = b;
2316 wp += (b < lo || b > hi) as usize;
2317 ri += 1;
2318 }
2319
2320 wp
2321 }
2322}
2323
2324#[cfg(not(target_arch = "x86_64"))]
2328fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2329 let len = src.len();
2330 let sp = src.as_ptr();
2331 let dp = dst.as_mut_ptr();
2332 let mut wp: usize = 0;
2333 let mut i: usize = 0;
2334
2335 while i + 8 <= len {
2337 unsafe {
2338 let b0 = *sp.add(i);
2339 *dp.add(wp) = b0;
2340 wp += (b0 < lo || b0 > hi) as usize;
2341 let b1 = *sp.add(i + 1);
2342 *dp.add(wp) = b1;
2343 wp += (b1 < lo || b1 > hi) as usize;
2344 let b2 = *sp.add(i + 2);
2345 *dp.add(wp) = b2;
2346 wp += (b2 < lo || b2 > hi) as usize;
2347 let b3 = *sp.add(i + 3);
2348 *dp.add(wp) = b3;
2349 wp += (b3 < lo || b3 > hi) as usize;
2350 let b4 = *sp.add(i + 4);
2351 *dp.add(wp) = b4;
2352 wp += (b4 < lo || b4 > hi) as usize;
2353 let b5 = *sp.add(i + 5);
2354 *dp.add(wp) = b5;
2355 wp += (b5 < lo || b5 > hi) as usize;
2356 let b6 = *sp.add(i + 6);
2357 *dp.add(wp) = b6;
2358 wp += (b6 < lo || b6 > hi) as usize;
2359 let b7 = *sp.add(i + 7);
2360 *dp.add(wp) = b7;
2361 wp += (b7 < lo || b7 > hi) as usize;
2362 }
2363 i += 8;
2364 }
2365
2366 while i < len {
2368 unsafe {
2369 let b = *sp.add(i);
2370 *dp.add(wp) = b;
2371 wp += (b < lo || b > hi) as usize;
2372 }
2373 i += 1;
2374 }
2375
2376 wp
2377}
2378
2379fn delete_range_streaming(
2384 lo: u8,
2385 hi: u8,
2386 reader: &mut impl Read,
2387 writer: &mut impl Write,
2388) -> io::Result<()> {
2389 let mut buf = alloc_uninit_vec(STREAM_BUF);
2390 loop {
2391 let n = read_once(reader, &mut buf)?;
2392 if n == 0 {
2393 break;
2394 }
2395 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2396 if wp > 0 {
2397 writer.write_all(&buf[..wp])?;
2398 }
2399 }
2400 Ok(())
2401}
2402
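// In-place variant: compaction is safe because the write index `wp` never
// overtakes the read index, so every kept byte is copied to an offset at or
// before the position it was read from.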
2403#[inline]
2406fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2407 #[cfg(target_arch = "x86_64")]
2408 {
2409 let level = get_simd_level();
2410 if level >= 3 {
2411 return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2412 }
2413 }
2414 let ptr = buf.as_mut_ptr();
2416 let mut ri = 0;
2417 let mut wp = 0;
2418 unsafe {
2419 while ri + 8 <= n {
2420 let b0 = *ptr.add(ri);
2421 let b1 = *ptr.add(ri + 1);
2422 let b2 = *ptr.add(ri + 2);
2423 let b3 = *ptr.add(ri + 3);
2424 let b4 = *ptr.add(ri + 4);
2425 let b5 = *ptr.add(ri + 5);
2426 let b6 = *ptr.add(ri + 6);
2427 let b7 = *ptr.add(ri + 7);
2428 *ptr.add(wp) = b0;
2429 wp += (b0 < lo || b0 > hi) as usize;
2430 *ptr.add(wp) = b1;
2431 wp += (b1 < lo || b1 > hi) as usize;
2432 *ptr.add(wp) = b2;
2433 wp += (b2 < lo || b2 > hi) as usize;
2434 *ptr.add(wp) = b3;
2435 wp += (b3 < lo || b3 > hi) as usize;
2436 *ptr.add(wp) = b4;
2437 wp += (b4 < lo || b4 > hi) as usize;
2438 *ptr.add(wp) = b5;
2439 wp += (b5 < lo || b5 > hi) as usize;
2440 *ptr.add(wp) = b6;
2441 wp += (b6 < lo || b6 > hi) as usize;
2442 *ptr.add(wp) = b7;
2443 wp += (b7 < lo || b7 > hi) as usize;
2444 ri += 8;
2445 }
2446 while ri < n {
2447 let b = *ptr.add(ri);
2448 *ptr.add(wp) = b;
2449 wp += (b < lo || b > hi) as usize;
2450 ri += 1;
2451 }
2452 }
2453 wp
2454}
2455
2456#[cfg(target_arch = "x86_64")]
2459#[target_feature(enable = "avx2")]
2460unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2461 use std::arch::x86_64::*;
2462
2463 unsafe {
2464 let range = hi - lo;
2465 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2466 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2467 let zero = _mm256_setzero_si256();
2468
2469 let ptr = buf.as_mut_ptr();
2470 let mut ri = 0;
2471 let mut wp = 0;
2472
2473 while ri + 32 <= n {
2474 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2475 let biased = _mm256_add_epi8(input, bias_v);
2476 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2477 let in_range = _mm256_cmpeq_epi8(gt, zero);
2478 let del_mask = _mm256_movemask_epi8(in_range) as u32;
2479
2480 if del_mask == 0 {
2481 if wp != ri {
2483 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2484 }
2485 wp += 32;
2486 } else if del_mask != 0xFFFFFFFF {
2487 let keep_mask = !del_mask;
2492 let m0 = keep_mask as u8;
2493 let m1 = (keep_mask >> 8) as u8;
2494 let m2 = (keep_mask >> 16) as u8;
2495 let m3 = (keep_mask >> 24) as u8;
2496
2497 let c0 = m0.count_ones() as usize;
2498 let c1 = m1.count_ones() as usize;
2499 let c2 = m2.count_ones() as usize;
2500 let c3 = m3.count_ones() as usize;
2501
2502 if m0 == 0xFF {
2504 std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2505 } else if m0 != 0 {
2506 let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2507 let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2508 let out_v = _mm_shuffle_epi8(src_v, shuf);
2509 _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2510 }
2511
2512 if m1 == 0xFF {
2514 std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2515 } else if m1 != 0 {
2516 let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2517 let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2518 let out_v = _mm_shuffle_epi8(src_v, shuf);
2519 _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2520 }
2521
2522 if m2 == 0xFF {
2524 std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2525 } else if m2 != 0 {
2526 let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2527 let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2528 let out_v = _mm_shuffle_epi8(src_v, shuf);
2529 _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2530 }
2531
2532 if m3 == 0xFF {
2534 std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2535 } else if m3 != 0 {
2536 let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2537 let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2538 let out_v = _mm_shuffle_epi8(src_v, shuf);
2539 _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2540 }
2541
2542 wp += c0 + c1 + c2 + c3;
2543 }
2544 ri += 32;
2546 }
2547
2548 while ri < n {
2550 let b = *ptr.add(ri);
2551 *ptr.add(wp) = b;
2552 wp += (b < lo || b > hi) as usize;
2553 ri += 1;
2554 }
2555
2556 wp
2557 }
2558}
2559
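/// Streaming translate: each byte that appears in `set1` is rewritten to the
/// byte at the same position in `set2`; when `set2` is shorter, its last byte
/// is reused (see `build_translate_table`). All other bytes pass through.
///
/// Illustrative usage sketch (in-memory reader/writer assumed):
///
/// ```text
/// let mut input = std::io::Cursor::new(b"hello".to_vec());
/// let mut output: Vec<u8> = Vec::new();
/// translate(b"el", b"ip", &mut input, &mut output)?;   // "hello" -> "hippo"
/// ```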
2560pub fn translate(
2565 set1: &[u8],
2566 set2: &[u8],
2567 reader: &mut impl Read,
2568 writer: &mut impl Write,
2569) -> io::Result<()> {
2570 let table = build_translate_table(set1, set2);
2571
2572 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2574 if is_identity {
2575 return passthrough_stream(reader, writer);
2576 }
2577
2578 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2580 return translate_range_stream(lo, hi, offset, reader, writer);
2581 }
2582
2583 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2586 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2587 }
2588
2589 let mut buf = alloc_uninit_vec(STREAM_BUF);
2594 loop {
2595 let n = read_once(reader, &mut buf)?;
2596 if n == 0 {
2597 break;
2598 }
2599 translate_and_write_table(&mut buf, n, &table, writer)?;
2600 }
2601 Ok(())
2602}
2603
2604#[inline]
2605fn translate_and_write_table(
2606 buf: &mut [u8],
2607 total: usize,
2608 table: &[u8; 256],
2609 writer: &mut impl Write,
2610) -> io::Result<()> {
2611 if total >= PARALLEL_THRESHOLD {
2612 let nt = rayon::current_num_threads().max(1);
2613 let cs = (total / nt).max(32 * 1024);
2614 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2615 translate_inplace(chunk, table);
2616 });
2617 } else {
2618 translate_inplace(&mut buf[..total], table);
2619 }
2620 writer.write_all(&buf[..total])
2621}
2622
2623fn translate_range_stream(
2628 lo: u8,
2629 hi: u8,
2630 offset: i8,
2631 reader: &mut impl Read,
2632 writer: &mut impl Write,
2633) -> io::Result<()> {
2634 let mut buf = alloc_uninit_vec(STREAM_BUF);
2635 loop {
2636 let n = read_once(reader, &mut buf)?;
2637 if n == 0 {
2638 break;
2639 }
2640 translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2641 }
2642 Ok(())
2643}
2644
2645#[inline]
2646fn translate_and_write_range(
2647 buf: &mut [u8],
2648 total: usize,
2649 lo: u8,
2650 hi: u8,
2651 offset: i8,
2652 writer: &mut impl Write,
2653) -> io::Result<()> {
2654 if total >= PARALLEL_THRESHOLD {
2655 let nt = rayon::current_num_threads().max(1);
2656 let cs = (total / nt).max(32 * 1024);
2657 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2658 translate_range_simd_inplace(chunk, lo, hi, offset);
2659 });
2660 } else {
2661 translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2662 }
2663 writer.write_all(&buf[..total])
2664}
2665
2666fn translate_range_to_constant_stream(
2670 lo: u8,
2671 hi: u8,
2672 replacement: u8,
2673 reader: &mut impl Read,
2674 writer: &mut impl Write,
2675) -> io::Result<()> {
2676 let mut buf = alloc_uninit_vec(STREAM_BUF);
2677 loop {
2678 let n = read_once(reader, &mut buf)?;
2679 if n == 0 {
2680 break;
2681 }
2682 translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2683 }
2684 Ok(())
2685}
2686
2687#[inline]
2688fn translate_and_write_range_const(
2689 buf: &mut [u8],
2690 total: usize,
2691 lo: u8,
2692 hi: u8,
2693 replacement: u8,
2694 writer: &mut impl Write,
2695) -> io::Result<()> {
2696 if total >= PARALLEL_THRESHOLD {
2697 let nt = rayon::current_num_threads().max(1);
2698 let cs = (total / nt).max(32 * 1024);
2699 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2700 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2701 });
2702 } else {
2703 translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2704 }
2705 writer.write_all(&buf[..total])
2706}
2707
2708fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2711 let mut buf = alloc_uninit_vec(STREAM_BUF);
2712 loop {
2713 let n = read_once(reader, &mut buf)?;
2714 if n == 0 {
2715 break;
2716 }
2717 writer.write_all(&buf[..n])?;
2718 }
2719 Ok(())
2720}
2721
2722#[inline]
2728fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2729 loop {
2730 match reader.read(buf) {
2731 Ok(n) => return Ok(n),
2732 Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2733 Err(e) => return Err(e),
2734 }
2735 }
2736}
2737
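/// Translate-then-squeeze, mirroring `tr -s SET1 SET2`: bytes are first
/// translated through `set1` -> `set2`, then consecutive repeats of any byte
/// belonging to `set2` are collapsed to a single occurrence.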
2738pub fn translate_squeeze(
2739 set1: &[u8],
2740 set2: &[u8],
2741 reader: &mut impl Read,
2742 writer: &mut impl Write,
2743) -> io::Result<()> {
2744 let table = build_translate_table(set1, set2);
2745 let squeeze_set = build_member_set(set2);
2746
2747 if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2750 let squeeze_ch = set2.last().copied().unwrap_or(0);
2751 return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2752 }
2753
2754 let range_info = detect_range_offset(&table);
2758 let range_const_info = if range_info.is_none() {
2759 detect_range_to_constant(&table)
2760 } else {
2761 None
2762 };
2763
2764 let mut buf = alloc_uninit_vec(STREAM_BUF);
2765 let mut last_squeezed: u16 = 256;
2766
2767 loop {
2768 let n = read_once(reader, &mut buf)?;
2769 if n == 0 {
2770 break;
2771 }
2772 let wp = translate_squeeze_process(
2773 &mut buf,
2774 n,
2775 &table,
2776 &squeeze_set,
2777 range_info,
2778 range_const_info,
2779 &mut last_squeezed,
2780 );
2781 if wp > 0 {
2782 writer.write_all(&buf[..wp])?;
2783 }
2784 }
2785 Ok(())
2786}
2787
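// `last_squeezed` is a u16 so that 256 can act as a "no previous squeezable
// byte" sentinel (no u8 ever compares equal to it). The caller threads it
// through successive reads, so runs that straddle a buffer refill are still
// collapsed to a single byte.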
2788#[inline]
2789fn translate_squeeze_process(
2790 buf: &mut [u8],
2791 n: usize,
2792 table: &[u8; 256],
2793 squeeze_set: &[u8; 32],
2794 range_info: Option<(u8, u8, i8)>,
2795 range_const_info: Option<(u8, u8, u8)>,
2796 last_squeezed: &mut u16,
2797) -> usize {
2798 if let Some((lo, hi, offset)) = range_info {
2800 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2801 } else if let Some((lo, hi, replacement)) = range_const_info {
2802 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2803 } else {
2804 translate_inplace(&mut buf[..n], table);
2805 }
2806 let mut wp = 0;
2808 unsafe {
2809 let ptr = buf.as_mut_ptr();
2810 let mut i = 0;
2811 while i + 8 <= n {
2812 macro_rules! squeeze_byte {
2813 ($off:expr) => {
2814 let b = *ptr.add(i + $off);
2815 if is_member(squeeze_set, b) {
2816 if *last_squeezed != b as u16 {
2817 *last_squeezed = b as u16;
2818 *ptr.add(wp) = b;
2819 wp += 1;
2820 }
2821 } else {
2822 *last_squeezed = 256;
2823 *ptr.add(wp) = b;
2824 wp += 1;
2825 }
2826 };
2827 }
2828 squeeze_byte!(0);
2829 squeeze_byte!(1);
2830 squeeze_byte!(2);
2831 squeeze_byte!(3);
2832 squeeze_byte!(4);
2833 squeeze_byte!(5);
2834 squeeze_byte!(6);
2835 squeeze_byte!(7);
2836 i += 8;
2837 }
2838 while i < n {
2839 let b = *ptr.add(i);
2840 if is_member(squeeze_set, b) {
2841 if *last_squeezed == b as u16 {
2842 i += 1;
2843 continue;
2844 }
2845 *last_squeezed = b as u16;
2846 } else {
2847 *last_squeezed = 256;
2848 }
2849 *ptr.add(wp) = b;
2850 wp += 1;
2851 i += 1;
2852 }
2853 }
2854 wp
2855}
2856
2857fn translate_squeeze_single_ch(
2861 table: &[u8; 256],
2862 squeeze_ch: u8,
2863 _squeeze_set: &[u8; 32],
2864 reader: &mut impl Read,
2865 writer: &mut impl Write,
2866) -> io::Result<()> {
2867 let range_info = detect_range_offset(table);
2868 let range_const_info = if range_info.is_none() {
2869 detect_range_to_constant(table)
2870 } else {
2871 None
2872 };
2873
2874 let pair = [squeeze_ch, squeeze_ch];
2875 let finder = memchr::memmem::Finder::new(&pair);
2876 let mut buf = alloc_uninit_vec(STREAM_BUF);
2877 let mut was_squeeze_char = false;
2878
2879 loop {
2880 let n = read_once(reader, &mut buf)?;
2881 if n == 0 {
2882 break;
2883 }
2884 let wp = translate_squeeze_single_process(
2885 &mut buf,
2886 n,
2887 table,
2888 squeeze_ch,
2889 &finder,
2890 range_info,
2891 range_const_info,
2892 &mut was_squeeze_char,
2893 );
2894 if wp > 0 {
2895 writer.write_all(&buf[..wp])?;
2896 }
2897 }
2898 Ok(())
2899}
2900
2901#[inline]
2902fn translate_squeeze_single_process(
2903 buf: &mut [u8],
2904 n: usize,
2905 table: &[u8; 256],
2906 squeeze_ch: u8,
2907 finder: &memchr::memmem::Finder<'_>,
2908 range_info: Option<(u8, u8, i8)>,
2909 range_const_info: Option<(u8, u8, u8)>,
2910 was_squeeze_char: &mut bool,
2911) -> usize {
2912 if let Some((lo, hi, offset)) = range_info {
2914 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2915 } else if let Some((lo, hi, replacement)) = range_const_info {
2916 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2917 } else {
2918 translate_inplace(&mut buf[..n], table);
2919 }
2920
2921 let mut i = 0;
2923 if *was_squeeze_char {
2924 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2925 i += 1;
2926 }
2927 *was_squeeze_char = false;
2928 if i >= n {
2929 *was_squeeze_char = true;
2930 return 0;
2931 }
2932 }
2933
2934 let ptr = buf.as_mut_ptr();
2935 let mut wp = 0usize;
2936
2937 loop {
2938 match finder.find(&buf[i..n]) {
2939 Some(offset) => {
2940 let seg_end = i + offset + 1;
2941 let gap = seg_end - i;
2942 if gap > 0 {
2943 if wp != i {
2944 unsafe {
2945 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2946 }
2947 }
2948 wp += gap;
2949 }
2950 i = seg_end;
2951 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2952 i += 1;
2953 }
2954 if i >= n {
2955 *was_squeeze_char = true;
2956 break;
2957 }
2958 }
2959 None => {
2960 let rem = n - i;
2961 if rem > 0 {
2962 if wp != i {
2963 unsafe {
2964 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2965 }
2966 }
2967 wp += rem;
2968 }
2969 *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2970 break;
2971 }
2972 }
2973 }
2974 wp
2975}
2976
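/// Streaming delete, dispatched by the shape of `delete_chars`: a single byte
/// uses `memchr`, up to three bytes use `memchr2`/`memchr3`, a contiguous
/// range uses the SIMD range kernels, and everything else falls back to a
/// 256-bit membership bitset.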
2977pub fn delete(
2978 delete_chars: &[u8],
2979 reader: &mut impl Read,
2980 writer: &mut impl Write,
2981) -> io::Result<()> {
    if delete_chars.is_empty() {
        // Nothing to delete; pass the stream through rather than letting the
        // small-set fast paths below index an empty slice.
        return passthrough_stream(reader, writer);
    }
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
2985 if delete_chars.len() <= 3 {
2986 return delete_multi_streaming(delete_chars, reader, writer);
2987 }
2988
2989 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2993 return delete_range_streaming(lo, hi, reader, writer);
2994 }
2995
2996 let member = build_member_set(delete_chars);
2997 let mut buf = alloc_uninit_vec(STREAM_BUF);
2998 let mut outbuf = alloc_uninit_vec(STREAM_BUF);
3001
3002 loop {
3003 let n = read_once(reader, &mut buf)?;
3004 if n == 0 {
3005 break;
3006 }
3007 let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
3008 if wp > 0 {
3009 writer.write_all(&outbuf[..wp])?;
3010 }
3011 }
3012 Ok(())
3013}
3014
3015#[inline]
3016fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3017 #[cfg(target_arch = "x86_64")]
3018 {
3019 if get_simd_level() >= 3 {
3020 return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3021 }
3022 }
3023 delete_bitset_scalar(src, dst, member)
3024}
3025
3026#[inline]
3028fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3029 let n = src.len();
3030 let mut wp = 0;
3031 unsafe {
3032 let sp = src.as_ptr();
3033 let dp = dst.as_mut_ptr();
3034 let mut i = 0;
3035 while i + 8 <= n {
3036 let b0 = *sp.add(i);
3037 let b1 = *sp.add(i + 1);
3038 let b2 = *sp.add(i + 2);
3039 let b3 = *sp.add(i + 3);
3040 let b4 = *sp.add(i + 4);
3041 let b5 = *sp.add(i + 5);
3042 let b6 = *sp.add(i + 6);
3043 let b7 = *sp.add(i + 7);
3044 *dp.add(wp) = b0;
3045 wp += !is_member(member, b0) as usize;
3046 *dp.add(wp) = b1;
3047 wp += !is_member(member, b1) as usize;
3048 *dp.add(wp) = b2;
3049 wp += !is_member(member, b2) as usize;
3050 *dp.add(wp) = b3;
3051 wp += !is_member(member, b3) as usize;
3052 *dp.add(wp) = b4;
3053 wp += !is_member(member, b4) as usize;
3054 *dp.add(wp) = b5;
3055 wp += !is_member(member, b5) as usize;
3056 *dp.add(wp) = b6;
3057 wp += !is_member(member, b6) as usize;
3058 *dp.add(wp) = b7;
3059 wp += !is_member(member, b7) as usize;
3060 i += 8;
3061 }
3062 while i < n {
3063 let b = *sp.add(i);
3064 *dp.add(wp) = b;
3065 wp += !is_member(member, b) as usize;
3066 i += 1;
3067 }
3068 }
3069 wp
3070}
3071
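// AVX2 membership test against the 256-bit set without leaving registers:
// for every input byte b, look up bitmap byte (b >> 3) via two 16-byte
// shuffles (low and high half of `member`, selected by bit 4 of the index)
// and AND it with the single-bit mask for (b & 7) taken from `bit_table`.
// A zero result means the byte is not in the set and is kept.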
3072#[cfg(target_arch = "x86_64")]
3075#[target_feature(enable = "avx2")]
3076unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3077 use std::arch::x86_64::*;
3078
3079 unsafe {
3080 let n = src.len();
3081 let sp = src.as_ptr();
3082 let dp = dst.as_mut_ptr();
3083 let mut ri = 0;
3084 let mut wp = 0;
3085
3086 let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3089
3090 let mask7 = _mm256_set1_epi8(7);
3093 let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3094
3095 let bit_table = _mm256_setr_epi8(
3098 1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3099 0, 0, 0, 0, 0, 0, 0, 0,
3100 );
3101
3102 while ri + 32 <= n {
3103 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3104
3105 let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3107 let bit_pos = _mm256_and_si256(input, mask7);
3109 let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3111
3112 let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3120 let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3121 let lo_mask = _mm256_set1_epi8(0x0F);
3122 let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3123 let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3124 let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
            // Bit 4 of the byte index selects the high half of the bitmap; shifting
            // it left by 3 moves it into the sign bit, which is what blendv tests.
            let use_hi = _mm256_slli_epi16(byte_idx, 3);
            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3128
3129 let test = _mm256_and_si256(member_byte, bit_mask);
3131 let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3132 let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3134
3135 if keep_mask == 0xFFFFFFFF {
3136 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3138 wp += 32;
3139 } else if keep_mask != 0 {
3140 let m0 = keep_mask as u8;
3142 let m1 = (keep_mask >> 8) as u8;
3143 let m2 = (keep_mask >> 16) as u8;
3144 let m3 = (keep_mask >> 24) as u8;
3145
3146 if m0 == 0xFF {
3147 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3148 } else if m0 != 0 {
3149 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3150 }
3151 let c0 = m0.count_ones() as usize;
3152
3153 if m1 == 0xFF {
3154 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3155 } else if m1 != 0 {
3156 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3157 }
3158 let c1 = m1.count_ones() as usize;
3159
3160 if m2 == 0xFF {
3161 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3162 } else if m2 != 0 {
3163 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3164 }
3165 let c2 = m2.count_ones() as usize;
3166
3167 if m3 == 0xFF {
3168 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3169 } else if m3 != 0 {
3170 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3171 }
3172 let c3 = m3.count_ones() as usize;
3173 wp += c0 + c1 + c2 + c3;
3174 }
3175 ri += 32;
3177 }
3178
3179 while ri < n {
3181 let b = *sp.add(ri);
3182 *dp.add(wp) = b;
3183 wp += !is_member(member, b) as usize;
3184 ri += 1;
3185 }
3186
3187 wp
3188 }
3189}
3190
3191fn delete_single_streaming(
3192 ch: u8,
3193 reader: &mut impl Read,
3194 writer: &mut impl Write,
3195) -> io::Result<()> {
3196 let mut buf = alloc_uninit_vec(STREAM_BUF);
3197 loop {
3198 let n = read_once(reader, &mut buf)?;
3199 if n == 0 {
3200 break;
3201 }
3202 let wp = delete_single_inplace(&mut buf, n, ch);
3203 if wp > 0 {
3204 writer.write_all(&buf[..wp])?;
3205 }
3206 }
3207 Ok(())
3208}
3209
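// memchr-driven single-byte delete: find the next occurrence of `ch`, slide
// the run of kept bytes before it down to the write position with an
// overlapping copy, and repeat; runs that are already in place (`wp == i`)
// are skipped and only the indices advance.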
3210#[inline]
3212fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3213 let mut wp = 0;
3214 let mut i = 0;
3215 while i < n {
3216 match memchr::memchr(ch, &buf[i..n]) {
3217 Some(offset) => {
3218 if offset > 0 {
3219 if wp != i {
3220 unsafe {
3221 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3222 }
3223 }
3224 wp += offset;
3225 }
3226 i += offset + 1;
3227 }
3228 None => {
3229 let run_len = n - i;
3230 if run_len > 0 {
3231 if wp != i {
3232 unsafe {
3233 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3234 }
3235 }
3236 wp += run_len;
3237 }
3238 break;
3239 }
3240 }
3241 }
3242 wp
3243}
3244
3245fn delete_multi_streaming(
3246 chars: &[u8],
3247 reader: &mut impl Read,
3248 writer: &mut impl Write,
3249) -> io::Result<()> {
3250 let mut buf = alloc_uninit_vec(STREAM_BUF);
3251 loop {
3252 let n = read_once(reader, &mut buf)?;
3253 if n == 0 {
3254 break;
3255 }
3256 let wp = delete_multi_inplace(&mut buf, n, chars);
3257 if wp > 0 {
3258 writer.write_all(&buf[..wp])?;
3259 }
3260 }
3261 Ok(())
3262}
3263
3264#[inline]
3266fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3267 let mut wp = 0;
3268 let mut i = 0;
3269 while i < n {
3270 let found = if chars.len() == 2 {
3271 memchr::memchr2(chars[0], chars[1], &buf[i..n])
3272 } else {
3273 memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3274 };
3275 match found {
3276 Some(offset) => {
3277 if offset > 0 {
3278 if wp != i {
3279 unsafe {
3280 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3281 }
3282 }
3283 wp += offset;
3284 }
3285 i += offset + 1;
3286 }
3287 None => {
3288 let run_len = n - i;
3289 if run_len > 0 {
3290 if wp != i {
3291 unsafe {
3292 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3293 }
3294 }
3295 wp += run_len;
3296 }
3297 break;
3298 }
3299 }
3300 }
3301 wp
3302}
3303
3304pub fn delete_squeeze(
3305 delete_chars: &[u8],
3306 squeeze_chars: &[u8],
3307 reader: &mut impl Read,
3308 writer: &mut impl Write,
3309) -> io::Result<()> {
3310 let delete_set = build_member_set(delete_chars);
3311 let squeeze_set = build_member_set(squeeze_chars);
3312 let mut buf = alloc_uninit_vec(STREAM_BUF);
3313 let mut last_squeezed: u16 = 256;
3314
3315 loop {
3316 let n = read_once(reader, &mut buf)?;
3317 if n == 0 {
3318 break;
3319 }
3320 let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3321 if wp > 0 {
3322 writer.write_all(&buf[..wp])?;
3323 }
3324 }
3325 Ok(())
3326}
3327
3328#[inline]
3329fn delete_squeeze_inplace(
3330 buf: &mut [u8],
3331 n: usize,
3332 delete_set: &[u8; 32],
3333 squeeze_set: &[u8; 32],
3334 last_squeezed: &mut u16,
3335) -> usize {
3336 let mut wp = 0;
3337 unsafe {
3338 let ptr = buf.as_mut_ptr();
3339 let mut i = 0;
3340 while i + 8 <= n {
3341 macro_rules! process_byte {
3342 ($off:expr) => {
3343 let b = *ptr.add(i + $off);
3344 if !is_member(delete_set, b) {
3345 if is_member(squeeze_set, b) {
3346 if *last_squeezed != b as u16 {
3347 *last_squeezed = b as u16;
3348 *ptr.add(wp) = b;
3349 wp += 1;
3350 }
3351 } else {
3352 *last_squeezed = 256;
3353 *ptr.add(wp) = b;
3354 wp += 1;
3355 }
3356 }
3357 };
3358 }
3359 process_byte!(0);
3360 process_byte!(1);
3361 process_byte!(2);
3362 process_byte!(3);
3363 process_byte!(4);
3364 process_byte!(5);
3365 process_byte!(6);
3366 process_byte!(7);
3367 i += 8;
3368 }
3369 while i < n {
3370 let b = *ptr.add(i);
3371 if !is_member(delete_set, b) {
3372 if is_member(squeeze_set, b) {
3373 if *last_squeezed != b as u16 {
3374 *last_squeezed = b as u16;
3375 *ptr.add(wp) = b;
3376 wp += 1;
3377 }
3378 } else {
3379 *last_squeezed = 256;
3380 *ptr.add(wp) = b;
3381 wp += 1;
3382 }
3383 }
3384 i += 1;
3385 }
3386 }
3387 wp
3388}
3389
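/// Streaming squeeze: collapses every run of a byte contained in
/// `squeeze_chars` down to a single occurrence, picking a single-character,
/// memchr-based, or bitset implementation depending on the set size.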
3390pub fn squeeze(
3391 squeeze_chars: &[u8],
3392 reader: &mut impl Read,
3393 writer: &mut impl Write,
3394) -> io::Result<()> {
    if squeeze_chars.is_empty() {
        // Nothing to squeeze; pass the stream through so the memchr fast path
        // below never indexes an empty slice.
        return passthrough_stream(reader, writer);
    }
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }
3398
3399 if squeeze_chars.len() <= 3 {
3402 return squeeze_multi_stream(squeeze_chars, reader, writer);
3403 }
3404
3405 let member = build_member_set(squeeze_chars);
3406 let mut buf = alloc_uninit_vec(STREAM_BUF);
3407 let mut last_squeezed: u16 = 256;
3408
3409 loop {
3410 let n = read_once(reader, &mut buf)?;
3411 if n == 0 {
3412 break;
3413 }
3414 let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3415 if wp > 0 {
3416 writer.write_all(&buf[..wp])?;
3417 }
3418 }
3419 Ok(())
3420}
3421
3422#[inline]
3423fn squeeze_inplace_bitset(
3424 buf: &mut [u8],
3425 n: usize,
3426 member: &[u8; 32],
3427 last_squeezed: &mut u16,
3428) -> usize {
3429 let mut wp = 0;
3430 unsafe {
3431 let ptr = buf.as_mut_ptr();
3432 for i in 0..n {
3433 let b = *ptr.add(i);
3434 if is_member(member, b) {
3435 if *last_squeezed == b as u16 {
3436 continue;
3437 }
3438 *last_squeezed = b as u16;
3439 } else {
3440 *last_squeezed = 256;
3441 }
3442 *ptr.add(wp) = b;
3443 wp += 1;
3444 }
3445 }
3446 wp
3447}
3448
3449fn squeeze_multi_stream(
3453 chars: &[u8],
3454 reader: &mut impl Read,
3455 writer: &mut impl Write,
3456) -> io::Result<()> {
3457 let c0 = chars[0];
3458 let c1 = chars[1];
3459 let c2 = if chars.len() >= 3 {
3460 Some(chars[2])
3461 } else {
3462 None
3463 };
3464
3465 let mut buf = alloc_uninit_vec(STREAM_BUF);
3466 let mut last_squeezed: u16 = 256;
3467
3468 loop {
3469 let n = read_once(reader, &mut buf)?;
3470 if n == 0 {
3471 break;
3472 }
3473 let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3474 if wp > 0 {
3475 writer.write_all(&buf[..wp])?;
3476 }
3477 }
3478 Ok(())
3479}
3480
3481#[inline]
3483fn squeeze_multi_compact(
3484 buf: &mut [u8],
3485 n: usize,
3486 c0: u8,
3487 c1: u8,
3488 c2: Option<u8>,
3489 last_squeezed: &mut u16,
3490) -> usize {
3491 let ptr = buf.as_mut_ptr();
3492 let mut wp = 0usize;
3493 let mut cursor = 0usize;
3494
3495 while cursor < n {
3496 let found = if let Some(c) = c2 {
3497 memchr::memchr3(c0, c1, c, &buf[cursor..n])
3498 } else {
3499 memchr::memchr2(c0, c1, &buf[cursor..n])
3500 };
3501 match found {
3502 Some(offset) => {
3503 let pos = cursor + offset;
3504 let b = unsafe { *ptr.add(pos) };
3505
3506 let gap = pos - cursor;
3507 if gap > 0 {
3508 if wp != cursor {
3509 unsafe {
3510 std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3511 }
3512 }
3513 wp += gap;
3514 *last_squeezed = 256;
3515 }
3516
3517 if *last_squeezed != b as u16 {
3518 unsafe { *ptr.add(wp) = b };
3519 wp += 1;
3520 *last_squeezed = b as u16;
3521 }
3522
3523 cursor = pos + 1;
3524 while cursor < n && unsafe { *ptr.add(cursor) } == b {
3525 cursor += 1;
3526 }
3527 }
3528 None => {
3529 let rem = n - cursor;
3530 if rem > 0 {
3531 if wp != cursor {
3532 unsafe {
3533 std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3534 }
3535 }
3536 wp += rem;
3537 *last_squeezed = 256;
3538 }
3539 break;
3540 }
3541 }
3542 }
3543 wp
3544}
3545
3546fn squeeze_single_stream(
3547 ch: u8,
3548 reader: &mut impl Read,
3549 writer: &mut impl Write,
3550) -> io::Result<()> {
3551 let mut buf = alloc_uninit_vec(STREAM_BUF);
3552 let mut was_squeeze_char = false;
3553
3554 #[cfg(target_arch = "x86_64")]
3556 if get_simd_level() >= 3 {
3557 loop {
3558 let n = read_once(reader, &mut buf)?;
3559 if n == 0 {
3560 break;
3561 }
3562 let wp = unsafe { squeeze_single_avx2_inplace(&mut buf, n, ch, &mut was_squeeze_char) };
3563 if wp > 0 {
3564 writer.write_all(&buf[..wp])?;
3565 }
3566 }
3567 return Ok(());
3568 }
3569
3570 let pair = [ch, ch];
3572 let finder = memchr::memmem::Finder::new(&pair);
3573 loop {
3574 let n = read_once(reader, &mut buf)?;
3575 if n == 0 {
3576 break;
3577 }
3578 let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3579 if wp > 0 {
3580 writer.write_all(&buf[..wp])?;
3581 }
3582 }
3583 Ok(())
3584}
3585
3586#[inline]
3588fn squeeze_single_compact(
3589 buf: &mut [u8],
3590 n: usize,
3591 ch: u8,
3592 finder: &memchr::memmem::Finder<'_>,
3593 was_squeeze_char: &mut bool,
3594) -> usize {
3595 let mut i = 0;
3596
3597 if *was_squeeze_char {
3599 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3600 i += 1;
3601 }
3602 *was_squeeze_char = false;
3603 if i >= n {
3604 *was_squeeze_char = true;
3605 return 0;
3606 }
3607 }
3608
3609 let ptr = buf.as_mut_ptr();
3610 let mut wp = 0usize;
3611
3612 loop {
3613 match finder.find(&buf[i..n]) {
3614 Some(offset) => {
3615 let seg_end = i + offset + 1;
3616 let gap = seg_end - i;
3617 if gap > 0 {
3618 if wp != i {
3619 unsafe {
3620 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3621 }
3622 }
3623 wp += gap;
3624 }
3625 i = seg_end;
3626 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3627 i += 1;
3628 }
3629 if i >= n {
3630 *was_squeeze_char = true;
3631 break;
3632 }
3633 }
3634 None => {
3635 let rem = n - i;
3636 if rem > 0 {
3637 if wp != i {
3638 unsafe {
3639 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3640 }
3641 }
3642 wp += rem;
3643 }
3644 *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3645 break;
3646 }
3647 }
3648 }
3649 wp
3650}
3651
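// Branch-free squeeze of a single byte value: `sq_mask` marks bytes equal to
// `ch`; a byte is dropped exactly when its predecessor (previous lane, or the
// carry bit from the previous block/buffer) was also `ch`, i.e.
// `remove = sq & ((sq << 1) | carry)`. This keeps the first byte of each run.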
3652#[cfg(target_arch = "x86_64")]
3663#[target_feature(enable = "avx2")]
3664unsafe fn squeeze_single_avx2_inplace(
3665 buf: &mut [u8],
3666 n: usize,
3667 ch: u8,
3668 was_squeeze_char: &mut bool,
3669) -> usize {
3670 use std::arch::x86_64::*;
3671
3672 unsafe {
3673 let ch_v = _mm256_set1_epi8(ch as i8);
3674 let ptr = buf.as_mut_ptr();
3675 let mut ri = 0;
3676 let mut wp = 0;
3677 let mut carry: u32 = if *was_squeeze_char { 1 } else { 0 };
3678
3679 while ri + 32 <= n {
3680 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
3681 let cmp = _mm256_cmpeq_epi8(input, ch_v);
3682 let sq_mask = _mm256_movemask_epi8(cmp) as u32;
3683
3684 let prev_sq_mask = (sq_mask << 1) | carry;
3687
3688 let remove_mask = sq_mask & prev_sq_mask;
3690
3691 carry = (sq_mask >> 31) & 1;
3693
3694 if remove_mask == 0 {
3695 if wp != ri {
3697 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
3698 }
3699 wp += 32;
3700 } else if remove_mask != 0xFFFFFFFF {
3701 let keep_mask = !remove_mask;
3703 let m0 = keep_mask as u8;
3704 let m1 = (keep_mask >> 8) as u8;
3705 let m2 = (keep_mask >> 16) as u8;
3706 let m3 = (keep_mask >> 24) as u8;
3707
3708 if m0 == 0xFF {
3709 std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3710 } else if m0 != 0 {
3711 compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3712 }
3713 let c0 = m0.count_ones() as usize;
3714
3715 if m1 == 0xFF {
3716 std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3717 } else if m1 != 0 {
3718 compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3719 }
3720 let c1 = m1.count_ones() as usize;
3721
3722 if m2 == 0xFF {
3723 std::ptr::copy_nonoverlapping(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
3724 } else if m2 != 0 {
3725 compact_8bytes_simd(ptr.add(ri + 16), ptr.add(wp + c0 + c1), m2);
3726 }
3727 let c2 = m2.count_ones() as usize;
3728
3729 if m3 == 0xFF {
3730 std::ptr::copy_nonoverlapping(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
3731 } else if m3 != 0 {
3732 compact_8bytes_simd(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), m3);
3733 }
3734 let c3 = m3.count_ones() as usize;
3735 wp += c0 + c1 + c2 + c3;
3736 }
3737 ri += 32;
3740 }
3741
3742 if ri + 16 <= n {
3744 let ch_v128 = _mm_set1_epi8(ch as i8);
3745 let input = _mm_loadu_si128(ptr.add(ri) as *const _);
3746 let cmp = _mm_cmpeq_epi8(input, ch_v128);
3747 let sq_mask = _mm_movemask_epi8(cmp) as u32 & 0xFFFF;
3748 let prev_sq_mask = (sq_mask << 1) | carry;
3749 let remove_mask = sq_mask & prev_sq_mask;
3750 carry = (sq_mask >> 15) & 1;
3751
3752 if remove_mask == 0 {
3753 if wp != ri {
3754 std::ptr::copy(ptr.add(ri), ptr.add(wp), 16);
3755 }
3756 wp += 16;
3757 } else if remove_mask != 0xFFFF {
3758 let keep_mask = !remove_mask;
3759 let m0 = keep_mask as u8;
3760 let m1 = (keep_mask >> 8) as u8;
3761 if m0 == 0xFF {
3762 std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3763 } else if m0 != 0 {
3764 compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3765 }
3766 let c0 = m0.count_ones() as usize;
3767 if m1 == 0xFF {
3768 std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3769 } else if m1 != 0 {
3770 compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3771 }
3772 wp += c0 + m1.count_ones() as usize;
3773 }
3774 ri += 16;
3775 }
3776
3777 while ri < n {
3779 let b = *ptr.add(ri);
            // Drop the byte only when it repeats the squeeze character just emitted.
            if !(b == ch && carry != 0) {
                *ptr.add(wp) = b;
                wp += 1;
            }
3786 carry = if b == ch { 1 } else { 0 };
3787 ri += 1;
3788 }
3789
3790 *was_squeeze_char = carry != 0;
3791 wp
3792 }
3793}
3794
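/// Translates a caller-owned, mutable buffer in place and writes the result,
/// avoiding the extra output allocation of the read-only paths; large buffers
/// are split across the rayon pool.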
3795pub fn translate_owned(
3803 set1: &[u8],
3804 set2: &[u8],
3805 data: &mut [u8],
3806 writer: &mut impl Write,
3807) -> io::Result<()> {
3808 let table = build_translate_table(set1, set2);
3809
3810 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3812 if is_identity {
3813 return writer.write_all(data);
3814 }
3815
3816 const OWNED_PARALLEL_MIN: usize = 64 * 1024 * 1024;
3820
3821 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3823 if data.len() >= OWNED_PARALLEL_MIN {
3824 let n_threads = rayon::current_num_threads().max(1);
3825 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3826 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3827 translate_range_simd_inplace(chunk, lo, hi, offset);
3828 });
3829 } else {
3830 translate_range_simd_inplace(data, lo, hi, offset);
3831 }
3832 return writer.write_all(data);
3833 }
3834
3835 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3837 if data.len() >= OWNED_PARALLEL_MIN {
3838 let n_threads = rayon::current_num_threads().max(1);
3839 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3840 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3841 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3842 });
3843 } else {
3844 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3845 }
3846 return writer.write_all(data);
3847 }
3848
3849 if data.len() >= OWNED_PARALLEL_MIN {
3851 let n_threads = rayon::current_num_threads().max(1);
3852 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3853 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3854 translate_inplace(chunk, &table);
3855 });
3856 } else {
3857 translate_inplace(data, &table);
3858 }
3859 writer.write_all(data)
3860}
3861
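/// Translate from an immutable (typically memory-mapped) input slice. The
/// source cannot be modified, so output goes through a bounce buffer: 256 KiB
/// chunks for small inputs, one full-size buffer filled in parallel for large
/// ones.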
3862pub fn translate_mmap(
3876 set1: &[u8],
3877 set2: &[u8],
3878 data: &[u8],
3879 writer: &mut impl Write,
3880) -> io::Result<()> {
3881 let table = build_translate_table(set1, set2);
3882
3883 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3885 if is_identity {
3886 return writer.write_all(data);
3887 }
3888
3889 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3891 return translate_mmap_range(data, writer, lo, hi, offset);
3892 }
3893
3894 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3896 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3897 }
3898
3899 translate_mmap_table(data, writer, &table)
3901}
3902
3903fn translate_mmap_range(
3905 data: &[u8],
3906 writer: &mut impl Write,
3907 lo: u8,
3908 hi: u8,
3909 offset: i8,
3910) -> io::Result<()> {
3911 if data.len() >= PARALLEL_THRESHOLD {
3913 let mut buf = alloc_uninit_vec(data.len());
3914 let n_threads = rayon::current_num_threads().max(1);
3915 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3916
3917 data.par_chunks(chunk_size)
3919 .zip(buf.par_chunks_mut(chunk_size))
3920 .for_each(|(src_chunk, dst_chunk)| {
3921 translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3922 });
3923
3924 return writer.write_all(&buf);
3925 }
3926
3927 const CHUNK: usize = 256 * 1024;
3929 let buf_size = data.len().min(CHUNK);
3930 let mut buf = alloc_uninit_vec(buf_size);
3931 for chunk in data.chunks(CHUNK) {
3932 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3933 writer.write_all(&buf[..chunk.len()])?;
3934 }
3935 Ok(())
3936}
3937
3938fn translate_mmap_range_to_constant(
3941 data: &[u8],
3942 writer: &mut impl Write,
3943 lo: u8,
3944 hi: u8,
3945 replacement: u8,
3946) -> io::Result<()> {
3947 if data.len() >= PARALLEL_THRESHOLD {
3949 let mut buf = alloc_uninit_vec(data.len());
3950 let n_threads = rayon::current_num_threads().max(1);
3951 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3952
3953 data.par_chunks(chunk_size)
3955 .zip(buf.par_chunks_mut(chunk_size))
3956 .for_each(|(src_chunk, dst_chunk)| {
3957 dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3958 translate_range_to_constant_simd_inplace(
3959 &mut dst_chunk[..src_chunk.len()],
3960 lo,
3961 hi,
3962 replacement,
3963 );
3964 });
3965
3966 return writer.write_all(&buf);
3967 }
3968
3969 const CHUNK: usize = 256 * 1024;
3971 let buf_size = data.len().min(CHUNK);
3972 let mut buf = alloc_uninit_vec(buf_size);
3973 for chunk in data.chunks(CHUNK) {
3974 buf[..chunk.len()].copy_from_slice(chunk);
3975 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3976 writer.write_all(&buf[..chunk.len()])?;
3977 }
3978 Ok(())
3979}
3980
3981fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3983 if data.len() >= PARALLEL_THRESHOLD {
3985 let mut buf = alloc_uninit_vec(data.len());
3986 let n_threads = rayon::current_num_threads().max(1);
3987 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3988
3989 data.par_chunks(chunk_size)
3990 .zip(buf.par_chunks_mut(chunk_size))
3991 .for_each(|(src_chunk, dst_chunk)| {
3992 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3993 });
3994
3995 return writer.write_all(&buf);
3996 }
3997
3998 const CHUNK: usize = 256 * 1024;
4000 let buf_size = data.len().min(CHUNK);
4001 let mut buf = alloc_uninit_vec(buf_size);
4002 for chunk in data.chunks(CHUNK) {
4003 translate_to(chunk, &mut buf[..chunk.len()], table);
4004 writer.write_all(&buf[..chunk.len()])?;
4005 }
4006 Ok(())
4007}
4008
4009pub fn translate_mmap_inplace(
4016 set1: &[u8],
4017 set2: &[u8],
4018 data: &mut [u8],
4019 writer: &mut impl Write,
4020) -> io::Result<()> {
4021 let table = build_translate_table(set1, set2);
4022
4023 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4025 if is_identity {
4026 return writer.write_all(data);
4027 }
4028
4029 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
4036 if data.len() >= PARALLEL_THRESHOLD {
4037 let n_threads = rayon::current_num_threads().max(1);
4038 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4039 data.par_chunks_mut(chunk_size)
4040 .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
4041 } else {
4042 translate_range_simd_inplace(data, lo, hi, offset);
4043 }
4044 return writer.write_all(data);
4045 }
4046
4047 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
4049 if data.len() >= PARALLEL_THRESHOLD {
4050 let n_threads = rayon::current_num_threads().max(1);
4051 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4052 data.par_chunks_mut(chunk_size).for_each(|chunk| {
4053 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
4054 });
4055 } else {
4056 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
4057 }
4058 return writer.write_all(data);
4059 }
4060
4061 if data.len() >= PARALLEL_THRESHOLD {
4063 let n_threads = rayon::current_num_threads().max(1);
4064 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4065 data.par_chunks_mut(chunk_size)
4066 .for_each(|chunk| translate_inplace(chunk, &table));
4067 } else {
4068 translate_inplace(data, &table);
4069 }
4070 writer.write_all(data)
4071}
4072
4073fn translate_to_separate_buf(
4081 data: &[u8],
4082 table: &[u8; 256],
4083 writer: &mut impl Write,
4084) -> io::Result<()> {
4085 let range_info = detect_range_offset(table);
4086 let const_info = if range_info.is_none() {
4087 detect_range_to_constant(table)
4088 } else {
4089 None
4090 };
4091
4092 if data.len() >= PARALLEL_THRESHOLD {
4093 let mut out_buf = alloc_uninit_vec(data.len());
4095 let n_threads = rayon::current_num_threads().max(1);
4096 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4097
4098 if let Some((lo, hi, offset)) = range_info {
4099 data.par_chunks(chunk_size)
4100 .zip(out_buf.par_chunks_mut(chunk_size))
4101 .for_each(|(src, dst)| {
4102 translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
4103 });
4104 } else if let Some((lo, hi, replacement)) = const_info {
4105 data.par_chunks(chunk_size)
4106 .zip(out_buf.par_chunks_mut(chunk_size))
4107 .for_each(|(src, dst)| {
4108 translate_range_to_constant_simd(
4109 src,
4110 &mut dst[..src.len()],
4111 lo,
4112 hi,
4113 replacement,
4114 );
4115 });
4116 } else {
4117 data.par_chunks(chunk_size)
4118 .zip(out_buf.par_chunks_mut(chunk_size))
4119 .for_each(|(src, dst)| {
4120 translate_to(src, &mut dst[..src.len()], table);
4121 });
4122 }
4123 return writer.write_all(&out_buf);
4124 }
4125
4126 let mut out_buf = alloc_uninit_vec(data.len());
4132 if let Some((lo, hi, offset)) = range_info {
4133 translate_range_simd(data, &mut out_buf, lo, hi, offset);
4134 } else if let Some((lo, hi, replacement)) = const_info {
4135 translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
4136 } else {
4137 translate_to(data, &mut out_buf, table);
4138 }
4139 writer.write_all(&out_buf)
4140}
4141
4142pub fn translate_mmap_readonly(
4146 set1: &[u8],
4147 set2: &[u8],
4148 data: &[u8],
4149 writer: &mut impl Write,
4150) -> io::Result<()> {
4151 let table = build_translate_table(set1, set2);
4152
4153 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4155 if is_identity {
4156 return writer.write_all(data);
4157 }
4158
4159 translate_to_separate_buf(data, &table, writer)
4160}
4161
4162pub fn translate_squeeze_mmap(
4168 set1: &[u8],
4169 set2: &[u8],
4170 data: &[u8],
4171 writer: &mut impl Write,
4172) -> io::Result<()> {
4173 let table = build_translate_table(set1, set2);
4174 let squeeze_set = build_member_set(set2);
4175
4176 if data.len() >= PARALLEL_THRESHOLD {
4181 let mut translated = alloc_uninit_vec(data.len());
4183 let range_info = detect_range_offset(&table);
4184 let n_threads = rayon::current_num_threads().max(1);
4185 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4186
4187 if let Some((lo, hi, offset)) = range_info {
4188 data.par_chunks(chunk_size)
4189 .zip(translated.par_chunks_mut(chunk_size))
4190 .for_each(|(src_chunk, dst_chunk)| {
4191 translate_range_simd(
4192 src_chunk,
4193 &mut dst_chunk[..src_chunk.len()],
4194 lo,
4195 hi,
4196 offset,
4197 );
4198 });
4199 } else {
4200 data.par_chunks(chunk_size)
4201 .zip(translated.par_chunks_mut(chunk_size))
4202 .for_each(|(src_chunk, dst_chunk)| {
4203 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
4204 });
4205 }
4206
4207 let mut last_squeezed: u16 = 256;
4211 let len = translated.len();
4212 let mut wp = 0;
4213 unsafe {
4214 let ptr = translated.as_mut_ptr();
4215 let mut i = 0;
4216 while i < len {
4217 let b = *ptr.add(i);
4218 if is_member(&squeeze_set, b) {
4219 if last_squeezed == b as u16 {
4220 i += 1;
4221 continue;
4222 }
4223 last_squeezed = b as u16;
4224 } else {
4225 last_squeezed = 256;
4226 }
4227 *ptr.add(wp) = b;
4228 wp += 1;
4229 i += 1;
4230 }
4231 }
4232 return writer.write_all(&translated[..wp]);
4233 }
4234
4235 let mut buf = alloc_uninit_vec(data.len());
4238 translate_to(data, &mut buf, &table);
4239 let mut last_squeezed: u16 = 256;
4240 let mut wp = 0;
4241 unsafe {
4242 let ptr = buf.as_mut_ptr();
4243 for i in 0..data.len() {
4244 let b = *ptr.add(i);
4245 if is_member(&squeeze_set, b) {
4246 if last_squeezed == b as u16 {
4247 continue;
4248 }
4249 last_squeezed = b as u16;
4250 } else {
4251 last_squeezed = 256;
4252 }
4253 *ptr.add(wp) = b;
4254 wp += 1;
4255 }
4256 }
4257 writer.write_all(&buf[..wp])
4258}
4259
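/// Delete from an immutable (typically memory-mapped) input. The first 1 KiB
/// is sampled to estimate how many bytes will be removed: sparse deletions are
/// written zero-copy as `IoSlice` runs over the mapping, dense deletions are
/// compacted into bounce buffers (in parallel for large inputs).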
4260pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4266 if delete_chars.len() == 1 {
4267 return delete_single_char_mmap(delete_chars[0], data, writer);
4268 }
4269 if delete_chars.len() <= 3 {
4270 return delete_multi_memchr_mmap(delete_chars, data, writer);
4271 }
4272
4273 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4275 return delete_range_mmap(data, writer, lo, hi);
4276 }
4277
4278 let member = build_member_set(delete_chars);
4279
4280 let sample_size = data.len().min(1024);
4285 let sample_deletes = data[..sample_size]
4286 .iter()
4287 .filter(|&&b| is_member(&member, b))
4288 .count();
4289 let estimated_deletes = if sample_size > 0 {
4290 data.len() * sample_deletes / sample_size
4291 } else {
4292 data.len()
4293 };
4294
4295 if estimated_deletes < MAX_IOV / 2 {
4296 return delete_bitset_zerocopy(data, &member, writer);
4297 }
4298
4299 if data.len() >= PARALLEL_THRESHOLD {
4301 let n_threads = rayon::current_num_threads().max(1);
4302 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4303
4304 let mut outbuf = alloc_uninit_vec(data.len());
4305 let chunk_lens: Vec<usize> = data
4306 .par_chunks(chunk_size)
4307 .zip(outbuf.par_chunks_mut(chunk_size))
4308 .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4309 .collect();
4310
4311 let slices: Vec<std::io::IoSlice> = chunk_lens
4315 .iter()
4316 .enumerate()
4317 .filter(|&(_, &len)| len > 0)
4318 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4319 .collect();
4320 return write_ioslices(writer, &slices);
4321 }
4322
4323 const COMPACT_BUF: usize = 256 * 1024;
4326 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4327
4328 for chunk in data.chunks(COMPACT_BUF) {
4329 let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4330 if out_pos > 0 {
4331 writer.write_all(&outbuf[..out_pos])?;
4332 }
4333 }
4334 Ok(())
4335}
4336
4337fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4342 let sample_size = data.len().min(1024);
4344 let sample_deletes = data[..sample_size]
4345 .iter()
4346 .filter(|&&b| b >= lo && b <= hi)
4347 .count();
4348 let estimated_deletes = if sample_size > 0 {
4354 data.len() * sample_deletes / sample_size
4355 } else {
4356 data.len()
4357 };
4358 if estimated_deletes < MAX_IOV / 2 {
4359 return delete_range_mmap_zerocopy(data, writer, lo, hi);
4360 }
4361
4362 if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
            .collect();

        let slices: Vec<std::io::IoSlice> = chunk_lens
            .iter()
            .enumerate()
            .filter(|&(_, &len)| len > 0)
            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
            .collect();
        return write_ioslices(writer, &slices);
    }

    const COMPACT_BUF: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);

    #[cfg(target_arch = "x86_64")]
    {
        let mut wp = 0;
        let level = get_simd_level();
        let len = data.len();
        let sp = data.as_ptr();
        let dp = outbuf.as_mut_ptr();
        let mut ri = 0;

        if level >= 3 {
            use std::arch::x86_64::*;
            let range = hi - lo;
            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
            let zero = unsafe { _mm256_setzero_si256() };

            while ri + 32 <= len {
                if wp + 32 > COMPACT_BUF {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }

                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);

                if keep_mask == 0xFFFFFFFF {
                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
                    wp += 32;
                } else if keep_mask != 0 {
                    let m0 = keep_mask as u8;
                    let m1 = (keep_mask >> 8) as u8;
                    let m2 = (keep_mask >> 16) as u8;
                    let m3 = (keep_mask >> 24) as u8;

                    if m0 == 0xFF {
                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
                    } else if m0 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
                    }
                    let c0 = m0.count_ones() as usize;

                    if m1 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
                        };
                    } else if m1 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
                    }
                    let c1 = m1.count_ones() as usize;

                    if m2 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
                        };
                    } else if m2 != 0 {
                        unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
                    }
                    let c2 = m2.count_ones() as usize;

                    if m3 == 0xFF {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                sp.add(ri + 24),
                                dp.add(wp + c0 + c1 + c2),
                                8,
                            )
                        };
                    } else if m3 != 0 {
                        unsafe {
                            compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
                        };
                    }
                    let c3 = m3.count_ones() as usize;
                    wp += c0 + c1 + c2 + c3;
                }
                ri += 32;
            }
        }

        while ri < len {
            if wp + 1 > COMPACT_BUF {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            let b = unsafe { *sp.add(ri) };
            unsafe { *dp.add(wp) = b };
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        if wp > 0 {
            writer.write_all(&outbuf[..wp])?;
        }
        return Ok(());
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        for chunk in data.chunks(COMPACT_BUF) {
            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
            if clen > 0 {
                writer.write_all(&outbuf[..clen])?;
            }
        }
        return Ok(());
    }

    #[allow(unreachable_code)]
    Ok(())
}

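/// Delete every byte in the inclusive range `lo..=hi` from memory-mapped input,
/// dispatching to the widest SIMD variant available at runtime and falling back
/// to the scalar run-splitting implementation otherwise.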
fn delete_range_mmap_zerocopy(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    #[cfg(target_arch = "x86_64")]
    {
        if get_simd_level() >= 3 {
            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
        }
        if get_simd_level() >= 2 {
            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
    }

    #[allow(unreachable_code)]
    delete_range_zerocopy_scalar(data, writer, lo, hi)
}

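/// Portable fallback for range deletion: scans byte-by-byte and emits each kept
/// run of `data` as an `IoSlice`, flushing whenever `MAX_IOV` slices accumulate.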
fn delete_range_zerocopy_scalar(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut run_start: usize = 0;
    let mut i: usize = 0;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if b >= lo && b <= hi {
            if i > run_start {
                iov.push(std::io::IoSlice::new(&data[run_start..i]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            run_start = i + 1;
        }
        i += 1;
    }
    if run_start < len {
        iov.push(std::io::IoSlice::new(&data[run_start..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

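/// AVX2 range deletion: classifies 32 bytes per iteration with a biased signed
/// comparison, then walks the resulting deletion mask bit by bit to close off
/// kept runs. Blocks containing nothing to delete simply extend the current run.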
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_zerocopy_avx2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                ri += 32;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 32;
        }

        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

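/// SSE2 counterpart of the AVX2 path above, processing 16 bytes per iteration.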
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_zerocopy_sse2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        while ri + 16 <= len {
            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

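/// NEON range deletion: emulates `movemask` by AND-ing the comparison result
/// with per-lane bit weights and reducing with widening pairwise adds, yielding
/// a 16-bit deletion mask per block.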
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn delete_range_zerocopy_neon(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::aarch64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());

        while ri + 16 <= len {
            let input = vld1q_u8(data.as_ptr().add(ri));
            let ge_lo = vcgeq_u8(input, lo_v);
            let le_hi = vcleq_u8(input, hi_v);
            let in_range = vandq_u8(ge_lo, le_hi);

            let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;

            if del_mask == 0 {
                ri += 16;
                continue;
            }

            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1;
            }

            ri += 16;
        }

        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}

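/// Copy `chunk` into `outbuf`, dropping every byte whose bit is set in `member`.
/// Stores are unconditional and the output cursor only advances for kept bytes,
/// so `outbuf` must be at least `chunk.len()` bytes. Returns the number kept.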
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

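/// Delete all bytes present in the `member` bitset by emitting the runs of kept
/// bytes directly from `data` as `IoSlice`s.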
fn delete_bitset_zerocopy(
    data: &[u8],
    member: &[u8; 32],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut i = 0;
    let mut run_start: Option<usize> = None;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if is_member(member, b) {
            if let Some(rs) = run_start {
                iov.push(std::io::IoSlice::new(&data[rs..i]));
                run_start = None;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
        } else if run_start.is_none() {
            run_start = Some(i);
        }
        i += 1;
    }
    if let Some(rs) = run_start {
        iov.push(std::io::IoSlice::new(&data[rs..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

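/// Delete a single byte value, using `memchr` to jump between occurrences and
/// emitting the untouched gaps in between.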
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
            if iov.len() >= MAX_IOV {
                write_ioslices(writer, &iov)?;
                iov.clear();
            }
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

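/// Delete two or three byte values via `memchr2`/`memchr3` iterators. Missing
/// entries default to NUL, so callers are expected to pass exactly two or three
/// characters.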
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;

    macro_rules! process_pos {
        ($pos:expr) => {
            if $pos > last {
                iov.push(std::io::IoSlice::new(&data[last..$pos]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            last = $pos + 1;
        };
    }

    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            process_pos!(pos);
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            process_pos!(pos);
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

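/// Combined delete + squeeze in a single pass: bytes in `delete_chars` are
/// dropped, and surviving runs of any byte in `squeeze_chars` are collapsed to
/// one occurrence. The whole result is buffered and written once.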
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    let mut outbuf = alloc_uninit_vec(data.len());
    let mut last_squeezed: u16 = 256;
    let mut out_pos = 0;

    for &b in data.iter() {
        if is_member(&delete_set, b) {
            continue;
        }
        if is_member(&squeeze_set, b) {
            if last_squeezed == b as u16 {
                continue;
            }
            last_squeezed = b as u16;
        } else {
            last_squeezed = 256;
        }
        unsafe {
            *outbuf.get_unchecked_mut(out_pos) = b;
        }
        out_pos += 1;
    }
    writer.write_all(&outbuf[..out_pos])
}

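/// Squeeze repeated occurrences of the bytes in `squeeze_chars` down to one.
/// Sets of one to three characters use memchr-based fast paths; larger sets use
/// a bitset scan, run per chunk in parallel for large inputs and stitched back
/// together at chunk boundaries so a run spanning two chunks still collapses.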
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut outbuf = alloc_uninit_vec(data.len());
    let len = data.len();
    let mut wp = 0;
    let mut i = 0;
    let mut last_squeezed: u16 = 256;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();

        while i < len {
            let b = *inp.add(i);
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
    }
    writer.write_all(&outbuf[..wp])
}

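/// Squeeze a single chunk into a freshly allocated buffer; used by the parallel
/// paths, which resolve duplicates across chunk boundaries afterwards.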
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

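/// Squeeze for exactly two or three squeeze characters (`N` is 2 or 3). Large
/// inputs go through the parallel bitset path; otherwise `memchr2`/`memchr3`
/// jump between occurrences and the untouched spans plus one byte per run are
/// emitted as `IoSlice`s.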
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
    let mut cursor = 0;
    let mut last_squeezed: u16 = 256;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

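/// Squeeze a single byte value. If the data never contains a doubled `ch`, it
/// is written through unchanged; otherwise a `memmem` finder locates each
/// doubled occurrence and only the first byte of the run is kept.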
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let pair = [ch, ch];
    if memchr::memmem::find(data, &pair).is_none() {
        return writer.write_all(data);
    }

    let finder = memchr::memmem::Finder::new(&pair);
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
    let mut cursor = 0;

    while cursor < data.len() {
        match finder.find(&data[cursor..]) {
            Some(offset) => {
                let pair_pos = cursor + offset;
                let seg_end = pair_pos + 1;
                if seg_end > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
                }
                let mut skip = seg_end;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }

    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
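
// Minimal sanity-check sketch (added for illustration, not part of the original
// test suite): exercises the single-character delete and squeeze paths against
// an in-memory Vec<u8> writer.
#[cfg(test)]
mod single_char_path_tests {
    use super::*;

    #[test]
    fn delete_single_char_removes_all_occurrences() {
        let mut out = Vec::new();
        delete_single_char_mmap(b'a', b"banana band", &mut out).unwrap();
        assert_eq!(out, b"bnn bnd".to_vec());
    }

    #[test]
    fn squeeze_single_collapses_runs() {
        let mut out = Vec::new();
        squeeze_single_mmap(b'l', b"helllo wallls", &mut out).unwrap();
        assert_eq!(out, b"helo wals".to_vec());
    }
}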