1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
/// Maximum number of `IoSlice`s handed to one `write_vectored` call;
/// `write_ioslices` splits longer slice lists into batches of this size.
const MAX_IOV: usize = 1024;

/// 8 MiB — presumably the per-chunk buffer size for streaming mode;
/// referenced elsewhere in the file (not in this chunk).
const STREAM_BUF: usize = 8 * 1024 * 1024;

/// 64 MiB — presumably the minimum input size before work is split across
/// rayon threads; referenced elsewhere in the file (not in this chunk).
const PARALLEL_THRESHOLD: usize = 64 * 1024 * 1024;

/// 512 MiB — presumably the cap on a single up-front allocation;
/// referenced elsewhere in the file (not in this chunk).
const SINGLE_ALLOC_LIMIT: usize = 512 * 1024 * 1024;
24
#[cfg(target_arch = "x86_64")]
/// For every 8-bit mask, the positions of its set bits packed to the left
/// (unused entries stay 0), e.g. mask 0b0000_0101 -> [0, 2, 0, ...].
/// Built at compile time; `while` loops are used because iterators are not
/// usable in const evaluation.
/// NOTE(review): not referenced in this chunk — presumably consumed by a
/// squeeze/delete compaction path elsewhere in the file; confirm.
static COMPACT_LUT: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};
48
49#[inline]
52fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
53 if slices.is_empty() {
54 return Ok(());
55 }
56 for batch in slices.chunks(MAX_IOV) {
57 let total: usize = batch.iter().map(|s| s.len()).sum();
58 match writer.write_vectored(batch) {
59 Ok(n) if n >= total => continue,
60 Ok(mut written) => {
61 for slice in batch {
63 let slen = slice.len();
64 if written >= slen {
65 written -= slen;
66 continue;
67 }
68 if written > 0 {
69 writer.write_all(&slice[written..])?;
70 written = 0;
71 } else {
72 writer.write_all(slice)?;
73 }
74 }
75 }
76 Err(e) => return Err(e),
77 }
78 }
79 Ok(())
80}
81
#[inline]
/// Allocate a `Vec<u8>` of length `len` whose contents are uninitialized.
///
/// Intended for buffers that are fully overwritten before being read.
/// NOTE(review): `set_len` over uninitialized memory is only sound if every
/// byte is written before any read — confirm all callers uphold this.
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        // SAFETY (claimed): capacity is exactly `len`; the bytes remain
        // uninitialized until the caller fills them.
        v.set_len(len);
    }
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        // For buffers of 2 MiB and up, ask the kernel for transparent huge
        // pages; madvise is advisory, so any failure is deliberately ignored.
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}
105
#[inline]
/// Build the 256-entry translation table for `tr SET1 SET2` semantics.
///
/// Every byte maps to itself except those named in `set1`: position `i` of
/// `set1` maps to position `i` of `set2`, and once `set2` runs out the
/// remaining `set1` bytes all map to `set2`'s final byte (classic `tr`
/// padding). An empty `set2` leaves `set1` bytes unchanged. Later duplicate
/// entries in `set1` overwrite earlier ones.
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    for (i, &from) in set1.iter().enumerate() {
        // Paired byte if one exists, else pad with set2's last byte,
        // else (set2 empty) keep the identity mapping.
        table[from as usize] = set2
            .get(i)
            .or_else(|| set2.last())
            .copied()
            .unwrap_or(from);
    }
    table
}
120
#[inline]
/// Build a 256-bit membership bitmap (32 bytes) from a list of bytes.
///
/// Bit `ch` of the result is set iff `ch` appears in `chars`; byte `ch >> 3`
/// holds bit `ch & 7`. Queried via `is_member`.
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    chars.iter().fold([0u8; 32], |mut set, &ch| {
        set[usize::from(ch) >> 3] |= 1 << (ch & 7);
        set
    })
}
130
#[inline(always)]
/// Test whether bit `ch` is set in a 256-bit membership bitmap built by
/// `build_member_set`.
///
/// Fix: the previous `get_unchecked` was unnecessary `unsafe` — `ch >> 3`
/// is at most 31 for any `u8`, so the index is provably within the 32-byte
/// array and the compiler elides the bounds check on safe indexing.
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    set[usize::from(ch) >> 3] & (1 << (ch & 7)) != 0
}
135
#[cfg(target_arch = "x86_64")]
/// Cached SIMD capability level: 0 = not yet detected, 1 = baseline,
/// 2 = SSSE3, 3 = AVX2. Written once by `get_simd_level`.
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
140
#[cfg(target_arch = "x86_64")]
#[inline(always)]
/// Return the CPU's SIMD level, probing once and caching it in `SIMD_LEVEL`.
///
/// Returns 3 for AVX2, 2 for SSSE3, 1 otherwise. Relaxed ordering suffices:
/// concurrent first calls may race on the probe, but every thread computes
/// and stores the same value, so the race is benign.
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}
158
#[cfg(target_arch = "x86_64")]
#[inline]
/// Count how many table entries map a byte to something other than itself.
/// Used to decide whether the sparse AVX2 translation path is worthwhile.
fn count_non_identity(table: &[u8; 256]) -> usize {
    let mut changed = 0;
    for (i, &mapped) in table.iter().enumerate() {
        if mapped != i as u8 {
            changed += 1;
        }
    }
    changed
}
169
#[inline(always)]
/// Translate `data` in place through `table`, dispatching to the fastest
/// implementation the running CPU supports.
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // With at most 16 non-identity entries, the "sparse" AVX2
            // variant can skip stores for vectors that did not change.
            let non_id = count_non_identity(table);
            if non_id > 0 && non_id <= 16 {
                unsafe { translate_inplace_avx2_sparse(data, table) };
                return;
            }
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    // Non-x86_64, or no usable SIMD level: portable fallback.
    translate_inplace_scalar(data, table);
}
200
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// In-place AVX2 translation tuned for mostly-identity tables.
///
/// The 256-entry table is split into 16 rows of 16 bytes (one row per high
/// nibble); each 32-byte chunk is translated by matching high nibbles
/// against every row and OR-ing in the `_mm256_shuffle_epi8` lookup of the
/// low nibbles. A `testz` on `input ^ result` skips the store when nothing
/// in the vector changed — the win for sparse tables.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Broadcast each 16-byte table row into both lanes so the in-lane
        // shuffle sees the same row in the low and high 128 bits.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            // Only write back if at least one byte actually changed.
            let diff = _mm256_xor_si256(input, result);
            if _mm256_testz_si256(diff, diff) == 0 {
                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            }
            i += 32;
        }

        // Scalar tail for the final len % 32 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
273
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
/// Portable in-place translation: each byte is replaced by its table entry.
///
/// Fix: the previous hand-unrolled `unsafe` pointer loop with
/// `get_unchecked` is unnecessary — a `u8` index can never exceed 255, so
/// indexing a `[u8; 256]` is provably in bounds and the compiler elides the
/// check; safe iteration performs the identical per-byte lookup.
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    for b in data.iter_mut() {
        *b = table[*b as usize];
    }
}
299
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// On aarch64 the "scalar" fallback forwards to the NEON nibble-table
/// kernel; NEON is a baseline feature on aarch64, so the call is sound.
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_inplace_neon_table(data, table) };
}
307
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// In-place NEON translation via a 16x16 nibble-split lookup.
///
/// The 256-entry table is loaded as 16 vectors (one per high nibble); each
/// 16-byte chunk is translated by comparing high nibbles against every row
/// index and OR-ing in the `vqtbl1q_u8` lookup of the low nibbles.
///
/// # Safety
/// Requires NEON (always present on aarch64).
unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(ptr.add(i), result);
            i += 16;
        }

        // Scalar tail for the final len % 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
369
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// In-place AVX2 translation through a full 256-entry table.
///
/// Same 16x16 nibble-split scheme as the sparse variant, but always stores,
/// and processes 64 bytes per iteration (two interleaved 32-byte vectors)
/// before falling back to 32-, then 16-byte, then scalar tails.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Broadcast each 16-byte table row into both 128-bit lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // Main loop: 64 bytes per iteration, two vectors interleaved to
        // hide shuffle latency.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail using the low lane of each row.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the last < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
549
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
/// In-place SSSE3 translation through a full 256-entry table, 16 bytes at a
/// time via the 16x16 nibble-split `_mm_shuffle_epi8` scheme.
///
/// # Safety
/// Caller must ensure the CPU supports SSSE3.
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final len % 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
613
#[inline(always)]
/// Translate `src` through `table` into `dst` (out-of-place), dispatching
/// to the fastest implementation available. Requires `dst.len() >= src.len()`.
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // Non-temporal stores require 32-byte alignment of dst.
            // NOTE(review): there is no size threshold here — NT stores
            // bypass the cache and can hurt small copies; confirm callers
            // only reach this with large buffers.
            if dst.as_ptr() as usize & 31 == 0 {
                unsafe { translate_to_avx2_table_nt(src, dst, table) };
            } else {
                unsafe { translate_to_avx2_table(src, dst, table) };
            }
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}
638
#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
/// Portable out-of-place translation: `dst[i] = table[src[i]]` for every
/// byte of `src`. Callers guarantee `dst.len() >= src.len()`.
///
/// Fix: replaces the unsafe hand-unrolled pointer loop. `zip` bounds the
/// iteration to `src.len()` (so a too-short `dst` can no longer cause an
/// out-of-bounds write), and a `u8` index into `[u8; 256]` is provably in
/// bounds, so the compiler elides the check — no `unsafe` needed.
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    for (d, &s) in dst.iter_mut().zip(src) {
        *d = table[s as usize];
    }
}
665
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// On aarch64 the "scalar" out-of-place fallback forwards to the NEON
/// kernel; NEON is a baseline feature on aarch64, so the call is sound.
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_to_neon_table(src, dst, table) };
}
672
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
/// Out-of-place NEON translation via the 16x16 nibble-split lookup.
/// Callers guarantee `dst.len() >= src.len()`.
///
/// # Safety
/// Requires NEON (always present on aarch64) and a sufficiently large `dst`.
unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(dp.add(i), result);
            i += 16;
        }

        // Scalar tail for the final len % 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
732
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// Out-of-place AVX2 translation through a full 256-entry table: 64 bytes
/// per main-loop iteration, then 32-, 16-byte and scalar tails. Regular
/// (cached) stores; see `translate_to_avx2_table_nt` for the non-temporal
/// variant. Callers guarantee `dst.len() >= src.len()`.
///
/// # Safety
/// Requires AVX2 and a sufficiently large `dst`.
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Broadcast each 16-byte table row into both 128-bit lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        // Main loop: two interleaved 32-byte vectors per iteration.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail using the low lane of each row.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the last < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
888
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// Out-of-place AVX2 translation using non-temporal (cache-bypassing)
/// stores for the 32-byte-aligned bulk of `dst`; the 16-byte and scalar
/// tails use regular stores, and an `sfence` at the end orders the NT
/// stores before any subsequent reads/writes of the buffer.
///
/// # Safety
/// Requires AVX2, `dst.len() >= src.len()`, and `dst` 32-byte aligned
/// (the dispatcher checks alignment before choosing this path).
unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // Broadcast each 16-byte table row into both 128-bit lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);
        let mut i = 0;

        // Main loop: two interleaved vectors, streamed (NT) to dst.
        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            // NT stores: dst stays 32-byte aligned since i is a multiple of 64.
            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail, still streamed (i remains 32-byte aligned).
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail: regular store (stream would need 16B-aligned NT op).
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the last < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }

        // Make the weakly-ordered NT stores globally visible before return.
        _mm_sfence();
    }
}
1047
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
/// Out-of-place SSSE3 translation through a full 256-entry table, 16 bytes
/// at a time via the 16x16 nibble-split `_mm_shuffle_epi8` scheme.
/// Callers guarantee `dst.len() >= src.len()`.
///
/// # Safety
/// Requires SSSE3 and a sufficiently large `dst`.
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 16-byte table row per high nibble.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final len % 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
1111
#[inline]
/// Detect a table of the form "identity, except one contiguous byte range
/// `[lo, hi]` shifted by a constant offset" (e.g. ASCII case conversion).
///
/// Returns `Some((lo, hi, offset))` when every non-identity entry lies in a
/// single contiguous run and all share the same difference; `None` for an
/// identity table or any other shape. The offset is truncated to `i8`,
/// which is consistent under the wrapping byte arithmetic used downstream.
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    // (lo, hi, offset) of the run seen so far; None until a change is found.
    let mut state: Option<(u8, u8, i16)> = None;
    for (i, &mapped) in table.iter().enumerate() {
        if mapped == i as u8 {
            continue;
        }
        let diff = mapped as i16 - i as i16;
        state = match state {
            None => Some((i as u8, i as u8, diff)),
            Some((lo, hi, offset)) => {
                // Must extend the run contiguously with the same offset.
                if diff != offset || i as u8 != hi.wrapping_add(1) {
                    return None;
                }
                Some((lo, i as u8, offset))
            }
        };
    }
    state.map(|(lo, hi, offset)| (lo, hi, offset as i8))
}
1146
#[inline]
/// Detect a table of the form "identity, except one contiguous byte range
/// `[lo, hi]` all mapped to the same replacement byte".
///
/// Returns `Some((lo, hi, replacement))` when every non-identity entry lies
/// in a single contiguous run and maps to one constant; `None` for an
/// identity table or any other shape.
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    // (lo, hi, replacement) of the run seen so far; None until a change is found.
    let mut state: Option<(u8, u8, u8)> = None;
    for (i, &mapped) in table.iter().enumerate() {
        if mapped == i as u8 {
            continue;
        }
        state = match state {
            None => Some((i as u8, i as u8, mapped)),
            Some((lo, hi, replacement)) => {
                // Must extend the run contiguously with the same target byte.
                if mapped != replacement || i as u8 != hi.wrapping_add(1) {
                    return None;
                }
                Some((lo, i as u8, replacement))
            }
        };
    }
    state
}
1177
#[cfg(target_arch = "x86_64")]
/// Replace every byte in `lo..=hi` with `replacement`, in place, using AVX2
/// when detected and otherwise SSE2 (baseline on x86_64, so the
/// unconditional fallback call is sound).
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}
1190
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
/// AVX2 in-place "range to constant": bytes in `lo..=hi` become
/// `replacement`, everything else is untouched.
///
/// Uses the classic signed-bias range test: adding `0x80 - lo` maps `lo` to
/// -128, so a byte is in `[lo, hi]` exactly when the biased value is NOT
/// signed-greater-than `-128 + (hi - lo)`; `cmpeq(gt, 0)` inverts the
/// greater-than mask into an in-range mask for `blendv`. Assumes `lo <= hi`
/// (guaranteed by the `detect_range_*` callers).
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // 32-byte tail.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte tail with 128-bit versions of the same constants.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1262
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
/// SSE2 in-place "range to constant": same signed-bias range test as the
/// AVX2 variant, but the blend is built from and/andnot/or because SSE2
/// has no `blendv`. Assumes `lo <= hi`.
///
/// # Safety
/// Requires SSE2 (baseline on x86_64).
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // Manual blend: replacement where in_range, original elsewhere.
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1306
/// In-place range->constant translation, aarch64 entry point.
#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    // SAFETY: NEON is mandatory on aarch64, so the feature-gated call is
    // always valid here.
    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
}
1311
/// Replaces every byte of `data` in `lo..=hi` with `replacement`, in place
/// (NEON, 32 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure NEON is available (it is mandatory on aarch64).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        // Two 16-byte vectors per iteration. Unlike x86, NEON has unsigned
        // byte compares, so the mask is simply (b >= lo) & (b <= hi).
        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            // vbslq selects repl_v where the mask is set, the input elsewhere.
            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        // One leftover 16-byte vector, if any.
        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        // Scalar tail for the final len % 16 bytes.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1361
/// Portable fallback: rewrites `data` in place, replacing every byte in
/// `lo..=hi` with `replacement`. Used when no SIMD backend is compiled in.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    data.iter_mut().for_each(|b| {
        if (lo..=hi).contains(b) {
            *b = replacement;
        }
    });
}
1370
1371#[cfg(target_arch = "x86_64")]
1374fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1375 if get_simd_level() >= 3 {
1376 unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1377 } else {
1378 unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1379 }
1380}
1381
/// Copying range->constant translation, aarch64 entry point.
#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    // SAFETY: NEON is mandatory on aarch64.
    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
}
1386
/// Copies `src` into `dst`, replacing every byte in `lo..=hi` with
/// `replacement` (NEON, 32 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure NEON is available and that `dst` is at least
/// `src.len()` bytes long (writes are unchecked).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        // Two vectors per iteration; mask = (b >= lo) & (b <= hi), then
        // vbslq picks repl_v for masked lanes and the input otherwise.
        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        // One leftover 16-byte vector, if any.
        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1431
/// Portable fallback: copies `src` into `dst`, replacing every byte in
/// `lo..=hi` with `replacement`.
///
/// Writes exactly `min(src.len(), dst.len())` bytes. The previous version
/// used `get_unchecked_mut`, which was undefined behavior whenever `dst`
/// was shorter than `src`; the safe zip below has no such hazard, performs
/// no per-element bounds checks, and auto-vectorizes.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for (d, &b) in dst.iter_mut().zip(src.iter()) {
        *d = if b >= lo && b <= hi { replacement } else { b };
    }
}
1440
/// Copies `src` into `dst`, replacing every byte in `lo..=hi` with
/// `replacement` (AVX2, 64 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure AVX2 is available and that `dst` is at least
/// `src.len()` bytes long (writes are unchecked).
/// Assumes `lo <= hi` — TODO confirm callers guarantee this.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so [lo, hi] maps to
        // [-128, -128 + range]; one signed cmpgt then flags out-of-range
        // bytes, and cmpeq-with-zero inverts the mask.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        // Main loop: two 32-byte vectors per iteration.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            // blendv picks repl_v where the in-range mask is set.
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }
        // One leftover 32-byte vector, if any.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }
        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1492
/// Copies `src` into `dst`, replacing every byte in `lo..=hi` with
/// `replacement` (SSE2, 16 bytes per iteration).
///
/// # Safety
/// Caller must ensure SSE2 is available (baseline on x86_64) and that
/// `dst` is at least `src.len()` bytes long (writes are unchecked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so a single signed
        // cmpgt against (0x80 + range) flags bytes outside [lo, hi].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // SSE2 has no blendv, so select with and/andnot/or.
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }
        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}
1532
/// Copying range->offset translation, x86_64 entry point.
///
/// On AVX2-capable CPUs a non-temporal (streaming-store) variant is used
/// when the destination base is 32-byte aligned — `_mm256_stream_si256`
/// requires aligned addresses, and the NT path only streams at 32-byte
/// offsets from the base. Otherwise the regular-store AVX2 or SSE2
/// variant runs.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        if dst.as_ptr() as usize & 31 == 0 {
            // SAFETY: AVX2 gated by the level check; dst is 32-byte aligned.
            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
        } else {
            // SAFETY: AVX2 gated by the level check.
            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
        }
    } else {
        // SAFETY: SSE2 is baseline on x86_64.
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}
1552
/// Copies `src` into `dst`, adding `offset` (wrapping) to every byte in
/// `lo..=hi` (AVX2, 64 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure AVX2 is available and that `dst` is at least
/// `src.len()` bytes long (writes are unchecked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so [lo, hi] maps to
        // [-128, -128 + range]; cmpgt flags out-of-range bytes and
        // cmpeq-with-zero inverts that into the in-range mask.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration. The add is made
        // branch-free by AND-ing the offset with the mask first, so
        // out-of-range lanes add zero.
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // One leftover 32-byte vector, if any.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // One leftover 16-byte vector via SSE, if any.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1635
/// Like [`translate_range_avx2`], but emits non-temporal (streaming)
/// 32-byte stores that bypass the cache — useful when `dst` is large and
/// will not be read back soon.
///
/// # Safety
/// Caller must ensure AVX2 is available, that `dst` is at least
/// `src.len()` bytes long, and that `dst`'s base pointer is 32-byte
/// aligned (all `_mm256_stream_si256` stores land at 32-byte offsets from
/// the base).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Same biased signed-compare classification as the cached variant.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            // Streaming stores: write-combining, no cache pollution.
            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // 16-byte remainder uses a regular (unaligned-safe) store.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }

        // Make the non-temporal stores globally visible before returning.
        _mm_sfence();
    }
}
1721
/// Copies `src` into `dst`, adding `offset` (wrapping) to every byte in
/// `lo..=hi` (SSE2, 16 bytes per iteration).
///
/// # Safety
/// Caller must ensure SSE2 is available (baseline on x86_64) and that
/// `dst` is at least `src.len()` bytes long (writes are unchecked).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so cmpgt against
        // (0x80 + range) flags bytes outside [lo, hi].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            // Branch-free: offset AND mask adds zero to out-of-range lanes.
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1759
/// Copying range->offset translation, aarch64 entry point.
#[cfg(target_arch = "aarch64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    // SAFETY: NEON is mandatory on aarch64.
    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
}
1766
/// Copies `src` into `dst`, adding `offset` (wrapping) to every byte in
/// `lo..=hi` (NEON, 32 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure NEON is available and that `dst` is at least
/// `src.len()` bytes long (writes are unchecked).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        // NEON has unsigned byte compares; the add is made branch-free by
        // AND-ing the offset with the in-range mask (wrapping u8 add is
        // bitwise identical to a signed add).
        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            let r0 = vaddq_u8(in0, off0);
            let r1 = vaddq_u8(in1, off1);
            vst1q_u8(dp.add(i), r0);
            vst1q_u8(dp.add(i + 16), r1);
            i += 32;
        }

        // One leftover 16-byte vector, if any.
        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(dp.add(i), vaddq_u8(input, off));
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1823
/// Portable fallback: copies `src` into `dst`, adding `offset` (wrapping)
/// to every byte in `lo..=hi`.
///
/// Writes exactly `min(src.len(), dst.len())` bytes. The previous version
/// walked raw pointers through a hand-unrolled macro and was undefined
/// behavior whenever `dst` was shorter than `src`; the safe zip below has
/// no such hazard, keeps the same branchless in-range test
/// (`(b - lo) mod 256 <= hi - lo`), and auto-vectorizes.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    for (d, &b) in dst.iter_mut().zip(src.iter()) {
        *d = if b.wrapping_sub(lo) <= range {
            b.wrapping_add(offset_u8)
        } else {
            b
        };
    }
}
1868
/// In-place range->offset translation, x86_64 entry point: dispatches to
/// the widest SIMD implementation the CPU supports.
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        // SAFETY: AVX2 gated by the level check.
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        // SAFETY: SSE2 is baseline on x86_64.
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}
1884
/// Adds `offset` (wrapping) to every byte of `data` in `lo..=hi`, in place
/// (AVX2, 64 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure AVX2 is available.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so cmpgt against
        // (0x80 + range) flags bytes outside [lo, hi]; cmpeq-with-zero
        // inverts that into the in-range mask.
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        // Main loop: two 32-byte vectors per iteration; branch-free add of
        // (offset AND mask).
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        // One leftover 32-byte vector, if any.
        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // One leftover 16-byte vector via SSE, if any.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1959
/// Adds `offset` (wrapping) to every byte of `data` in `lo..=hi`, in place
/// (SSE2, 16 bytes per iteration).
///
/// # Safety
/// Caller must ensure SSE2 is available (baseline on x86_64).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo) so cmpgt against
        // (0x80 + range) flags bytes outside [lo, hi].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            // Branch-free: out-of-range lanes add zero.
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
1998
/// In-place range->offset translation, aarch64 entry point.
#[cfg(target_arch = "aarch64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    // SAFETY: NEON is mandatory on aarch64.
    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
}
2003
/// Adds `offset` (wrapping) to every byte of `data` in `lo..=hi`, in place
/// (NEON, 32 bytes per main-loop iteration).
///
/// # Safety
/// Caller must ensure NEON is available (it is mandatory on aarch64).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        // Mask = (b >= lo) & (b <= hi); branch-free add of (offset AND mask).
        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
            i += 32;
        }

        // One leftover 16-byte vector, if any.
        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            if b >= lo && b <= hi {
                *ptr.add(i) = b.wrapping_add(offset as u8);
            }
            i += 1;
        }
    }
}
2052
/// Portable fallback: adds `offset` (wrapping) to every byte of `data` in
/// `lo..=hi`, in place. The membership test uses the branchless form
/// `(b - lo) mod 256 <= hi - lo`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let delta = offset as u8;
    let span = hi.wrapping_sub(lo);
    data.iter_mut()
        .filter(|b| b.wrapping_sub(lo) <= span)
        .for_each(|b| *b = b.wrapping_add(delta));
}
2063
/// Detects whether the delete set `chars` covers exactly one contiguous
/// byte range, returning `Some((lo, hi))` so callers can use the fast
/// range-based delete paths; `None` otherwise (including for empty input).
///
/// The previous check compared the span `hi - lo + 1` against
/// `chars.len()`, which duplicates could fool in both directions:
/// `[0, 2, 0]` (span 3, len 3) was falsely reported as the range `0..=2`
/// even though byte 1 is absent — making a range delete remove bytes not
/// in the set — while `[1, 2, 2, 3]` was needlessly rejected. Counting
/// *distinct* bytes fixes both: every distinct byte lies in `[lo, hi]`,
/// so exactly `hi - lo + 1` distinct bytes means the span is fully
/// covered.
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut seen = [false; 256];
    let mut lo = chars[0];
    let mut hi = chars[0];
    let mut distinct = 0usize;
    for &c in chars {
        if !seen[c as usize] {
            seen[c as usize] = true;
            distinct += 1;
        }
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    if (hi as usize - lo as usize + 1) == distinct {
        Some((lo, hi))
    } else {
        None
    }
}
2096
/// Copies `src` into `dst` dropping every byte in `lo..=hi`, returning the
/// number of bytes kept. x86_64 entry point: dispatches on SIMD level.
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if get_simd_level() >= 3 {
        // SAFETY: AVX2 gated by the level check.
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        // SAFETY: SSE2 is baseline on x86_64.
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}
2108
/// Copies `src` into `dst` dropping every byte in `lo..=hi` (AVX2),
/// returning the number of bytes written.
///
/// Compaction works on 8-byte groups: a per-byte keep bitmask drives the
/// `COMPACT_LUT` shuffle in `compact_8bytes_simd` (SSSE3, which every
/// AVX2 CPU has). Each group always writes a full 8 bytes but advances
/// the write cursor by only `popcount(mask)`, so later writes overwrite
/// the garbage.
///
/// # Safety
/// Caller must ensure AVX2 is available and that `dst` is at least
/// `src.len()` bytes long — group writes and the unconditional tail store
/// may touch up to that many bytes even when fewer are kept.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo), cmpgt against
        // (0x80 + range) flags bytes outside [lo, hi].
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // Bit i set in keep_mask <=> byte i survives the delete.
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                // Fast path: nothing deleted in this 32-byte chunk.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Compact each 8-byte lane; full-keep lanes use a plain
                // copy, all-delete lanes are skipped entirely.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // One leftover 16-byte chunk via SSE, if any.
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail: write unconditionally, advance only when kept.
        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2228
/// Scalar 8-byte compaction: gathers the source bytes whose bits are set
/// in `mask` to the front of `dst`, using `COMPACT_LUT` for the index
/// order. Always writes 8 bytes; only the first `popcount(mask)` are
/// meaningful (LUT entries past the count are 0, so the rest duplicate
/// `src[0]` and are overwritten by subsequent groups).
///
/// # Safety
/// `src` must be readable and `dst` writable for 8 bytes.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let idx = COMPACT_LUT.get_unchecked(mask as usize);
        *dst = *src.add(*idx.get_unchecked(0) as usize);
        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
    }
}
2251
/// SSSE3 8-byte compaction: a single `pshufb` with the `COMPACT_LUT` row
/// as shuffle control moves the kept bytes to the front. Always writes
/// 8 bytes; only the first `popcount(mask)` are meaningful.
///
/// # Safety
/// Caller must ensure SSSE3 is available; `src` must be readable and
/// `dst` writable for 8 bytes.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
#[inline]
unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
    use std::arch::x86_64::*;
    unsafe {
        let src_v = _mm_loadl_epi64(src as *const _);
        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
        let out_v = _mm_shuffle_epi8(src_v, shuf);
        _mm_storel_epi64(dst as *mut _, out_v);
    }
}
2268
/// Copies `src` into `dst` dropping every byte in `lo..=hi` (SSE2),
/// returning the number of bytes written. Uses the scalar
/// [`compact_8bytes`] rather than the SSSE3 shuffle, since SSE2-only CPUs
/// may lack `pshufb`.
///
/// # Safety
/// Caller must ensure SSE2 is available (baseline on x86_64) and that
/// `dst` is at least `src.len()` bytes long.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        // Signed-compare trick: bias by (0x80 - lo), cmpgt against
        // (0x80 + range) flags bytes outside [lo, hi].
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // Bit i set in keep_mask <=> byte i survives the delete.
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                // Fast path: nothing deleted in this chunk.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;
                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                wp += c0 + m1.count_ones() as usize;
            }
            ri += 16;
        }

        // Scalar tail: write unconditionally, advance only when kept.
        while ri < len {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2327
/// Portable fallback: copies `src` into `dst` dropping every byte in
/// `lo..=hi`, returning the number of bytes kept.
///
/// Branchless scheme: each byte is stored unconditionally at the write
/// cursor, which only advances when the byte is outside the delete range
/// — deleted bytes are simply overwritten by the next store. Because the
/// cursor never exceeds the read index, `dst` must be at least
/// `src.len()` bytes long; the raw-pointer writes are unchecked.
#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let len = src.len();
    let sp = src.as_ptr();
    let dp = dst.as_mut_ptr();
    let mut wp: usize = 0;
    let mut i: usize = 0;

    // Manually unrolled x8 main loop.
    while i + 8 <= len {
        unsafe {
            let b0 = *sp.add(i);
            *dp.add(wp) = b0;
            wp += (b0 < lo || b0 > hi) as usize;
            let b1 = *sp.add(i + 1);
            *dp.add(wp) = b1;
            wp += (b1 < lo || b1 > hi) as usize;
            let b2 = *sp.add(i + 2);
            *dp.add(wp) = b2;
            wp += (b2 < lo || b2 > hi) as usize;
            let b3 = *sp.add(i + 3);
            *dp.add(wp) = b3;
            wp += (b3 < lo || b3 > hi) as usize;
            let b4 = *sp.add(i + 4);
            *dp.add(wp) = b4;
            wp += (b4 < lo || b4 > hi) as usize;
            let b5 = *sp.add(i + 5);
            *dp.add(wp) = b5;
            wp += (b5 < lo || b5 > hi) as usize;
            let b6 = *sp.add(i + 6);
            *dp.add(wp) = b6;
            wp += (b6 < lo || b6 > hi) as usize;
            let b7 = *sp.add(i + 7);
            *dp.add(wp) = b7;
            wp += (b7 < lo || b7 > hi) as usize;
        }
        i += 8;
    }

    // Remaining 0-7 bytes.
    while i < len {
        unsafe {
            let b = *sp.add(i);
            *dp.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
        }
        i += 1;
    }

    wp
}
2382
2383fn delete_range_streaming(
2388 lo: u8,
2389 hi: u8,
2390 reader: &mut impl Read,
2391 writer: &mut impl Write,
2392) -> io::Result<()> {
2393 let mut buf = alloc_uninit_vec(STREAM_BUF);
2394 loop {
2395 let n = read_once(reader, &mut buf)?;
2396 if n == 0 {
2397 break;
2398 }
2399 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2400 if wp > 0 {
2401 writer.write_all(&buf[..wp])?;
2402 }
2403 }
2404 Ok(())
2405}
2406
/// Compacts the first `n` bytes of `buf` in place, removing every byte in
/// `lo..=hi`; returns the number of bytes kept (now at `buf[..kept]`).
///
/// The scalar path loads each group of 8 bytes into locals BEFORE storing
/// any of them: the write cursor `wp` never exceeds the read index `ri`,
/// so the unconditional stores can only land on positions already read.
/// On x86_64 with AVX2 available, defers to the vectorized variant.
#[inline]
fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            // SAFETY: AVX2 gated by the level check.
            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
        }
    }
    let ptr = buf.as_mut_ptr();
    let mut ri = 0;
    let mut wp = 0;
    // SAFETY: ri < n <= buf.len() for all reads, and wp <= ri for all
    // writes, so every access stays in bounds.
    unsafe {
        while ri + 8 <= n {
            // Read the whole group first so in-place stores can't clobber
            // bytes we haven't consumed yet.
            let b0 = *ptr.add(ri);
            let b1 = *ptr.add(ri + 1);
            let b2 = *ptr.add(ri + 2);
            let b3 = *ptr.add(ri + 3);
            let b4 = *ptr.add(ri + 4);
            let b5 = *ptr.add(ri + 5);
            let b6 = *ptr.add(ri + 6);
            let b7 = *ptr.add(ri + 7);
            // Branchless: store unconditionally, advance only when kept.
            *ptr.add(wp) = b0;
            wp += (b0 < lo || b0 > hi) as usize;
            *ptr.add(wp) = b1;
            wp += (b1 < lo || b1 > hi) as usize;
            *ptr.add(wp) = b2;
            wp += (b2 < lo || b2 > hi) as usize;
            *ptr.add(wp) = b3;
            wp += (b3 < lo || b3 > hi) as usize;
            *ptr.add(wp) = b4;
            wp += (b4 < lo || b4 > hi) as usize;
            *ptr.add(wp) = b5;
            wp += (b5 < lo || b5 > hi) as usize;
            *ptr.add(wp) = b6;
            wp += (b6 < lo || b6 > hi) as usize;
            *ptr.add(wp) = b7;
            wp += (b7 < lo || b7 > hi) as usize;
            ri += 8;
        }
        // Remaining 0-7 bytes.
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }
    }
    wp
}
2459
/// AVX2 in-place deletion of bytes in `[lo, hi]`; returns compacted length.
///
/// The unsigned range test is done with one signed compare per byte: bias
/// each byte by `0x80 - lo` so `[lo, hi]` maps onto `[-128, -128 + range]`
/// in the signed domain; anything above `threshold = -128 + range` is
/// outside the range (both `b < lo` and `b > hi` wrap onto the greater
/// side). Kept bytes are compacted 8 at a time with a PSHUFB permutation
/// from COMPACT_LUT. NOTE(review): assumes lo <= hi — `hi - lo` would
/// panic in debug builds otherwise; confirm against callers.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let ptr = buf.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // gt == 0 means the byte is inside [lo, hi], i.e. to be deleted.
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Keep all 32 bytes: bulk move only if displaced.
                // ptr::copy (memmove) because wp may be < 32 behind ri.
                if wp != ri {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
                }
                wp += 32;
            } else if del_mask != 0xFFFFFFFF {
                // Mixed keep/delete: compact each 8-byte lane with a shuffle
                // chosen by its keep mask. (del_mask == all-ones drops the
                // whole block and writes nothing.)
                let keep_mask = !del_mask;
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                let c0 = m0.count_ones() as usize;
                let c1 = m1.count_ones() as usize;
                let c2 = m2.count_ones() as usize;
                let c3 = m3.count_ones() as usize;

                // Each lane: either verbatim 8-byte move (mask 0xFF), a
                // PSHUFB compaction, or nothing (mask 0). The 8-byte store
                // may write garbage past the kept bytes; it is overwritten
                // by the next lane's store and wp only advances by c*.
                if m0 == 0xFF {
                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
                } else if m0 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
                }

                if m1 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
                } else if m1 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
                }

                if m2 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
                }

                if m3 == 0xFF {
                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
                    let out_v = _mm_shuffle_epi8(src_v, shuf);
                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
                }

                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // Scalar tail: same store-then-conditionally-advance compaction.
        while ri < n {
            let b = *ptr.add(ri);
            *ptr.add(wp) = b;
            wp += (b < lo || b > hi) as usize;
            ri += 1;
        }

        wp
    }
}
2563
2564pub fn translate(
2569 set1: &[u8],
2570 set2: &[u8],
2571 reader: &mut impl Read,
2572 writer: &mut impl Write,
2573) -> io::Result<()> {
2574 let table = build_translate_table(set1, set2);
2575
2576 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2578 if is_identity {
2579 return passthrough_stream(reader, writer);
2580 }
2581
2582 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2584 return translate_range_stream(lo, hi, offset, reader, writer);
2585 }
2586
2587 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2590 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2591 }
2592
2593 let mut buf = alloc_uninit_vec(STREAM_BUF);
2598 loop {
2599 let n = read_once(reader, &mut buf)?;
2600 if n == 0 {
2601 break;
2602 }
2603 translate_and_write_table(&mut buf, n, &table, writer)?;
2604 }
2605 Ok(())
2606}
2607
2608#[inline]
2609fn translate_and_write_table(
2610 buf: &mut [u8],
2611 total: usize,
2612 table: &[u8; 256],
2613 writer: &mut impl Write,
2614) -> io::Result<()> {
2615 if total >= PARALLEL_THRESHOLD {
2616 let nt = rayon::current_num_threads().max(1);
2617 let cs = (total / nt).max(32 * 1024);
2618 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2619 translate_inplace(chunk, table);
2620 });
2621 } else {
2622 translate_inplace(&mut buf[..total], table);
2623 }
2624 writer.write_all(&buf[..total])
2625}
2626
2627fn translate_range_stream(
2632 lo: u8,
2633 hi: u8,
2634 offset: i8,
2635 reader: &mut impl Read,
2636 writer: &mut impl Write,
2637) -> io::Result<()> {
2638 let mut buf = alloc_uninit_vec(STREAM_BUF);
2639 loop {
2640 let n = read_once(reader, &mut buf)?;
2641 if n == 0 {
2642 break;
2643 }
2644 translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2645 }
2646 Ok(())
2647}
2648
2649#[inline]
2650fn translate_and_write_range(
2651 buf: &mut [u8],
2652 total: usize,
2653 lo: u8,
2654 hi: u8,
2655 offset: i8,
2656 writer: &mut impl Write,
2657) -> io::Result<()> {
2658 if total >= PARALLEL_THRESHOLD {
2659 let nt = rayon::current_num_threads().max(1);
2660 let cs = (total / nt).max(32 * 1024);
2661 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2662 translate_range_simd_inplace(chunk, lo, hi, offset);
2663 });
2664 } else {
2665 translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2666 }
2667 writer.write_all(&buf[..total])
2668}
2669
2670fn translate_range_to_constant_stream(
2674 lo: u8,
2675 hi: u8,
2676 replacement: u8,
2677 reader: &mut impl Read,
2678 writer: &mut impl Write,
2679) -> io::Result<()> {
2680 let mut buf = alloc_uninit_vec(STREAM_BUF);
2681 loop {
2682 let n = read_once(reader, &mut buf)?;
2683 if n == 0 {
2684 break;
2685 }
2686 translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2687 }
2688 Ok(())
2689}
2690
2691#[inline]
2692fn translate_and_write_range_const(
2693 buf: &mut [u8],
2694 total: usize,
2695 lo: u8,
2696 hi: u8,
2697 replacement: u8,
2698 writer: &mut impl Write,
2699) -> io::Result<()> {
2700 if total >= PARALLEL_THRESHOLD {
2701 let nt = rayon::current_num_threads().max(1);
2702 let cs = (total / nt).max(32 * 1024);
2703 buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2704 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2705 });
2706 } else {
2707 translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2708 }
2709 writer.write_all(&buf[..total])
2710}
2711
2712fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2715 let mut buf = alloc_uninit_vec(STREAM_BUF);
2716 loop {
2717 let n = read_once(reader, &mut buf)?;
2718 if n == 0 {
2719 break;
2720 }
2721 writer.write_all(&buf[..n])?;
2722 }
2723 Ok(())
2724}
2725
/// Perform a single `read`, transparently retrying when the call is
/// interrupted by a signal (`ErrorKind::Interrupted` / EINTR).
///
/// Returns `Ok(0)` only at end of input; any other error is propagated.
#[inline]
fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    loop {
        match reader.read(buf) {
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            other => return other,
        }
    }
}
2741
2742pub fn translate_squeeze(
2743 set1: &[u8],
2744 set2: &[u8],
2745 reader: &mut impl Read,
2746 writer: &mut impl Write,
2747) -> io::Result<()> {
2748 let table = build_translate_table(set1, set2);
2749 let squeeze_set = build_member_set(set2);
2750
2751 if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2754 let squeeze_ch = set2.last().copied().unwrap_or(0);
2755 return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2756 }
2757
2758 let range_info = detect_range_offset(&table);
2762 let range_const_info = if range_info.is_none() {
2763 detect_range_to_constant(&table)
2764 } else {
2765 None
2766 };
2767
2768 let mut buf = alloc_uninit_vec(STREAM_BUF);
2769 let mut last_squeezed: u16 = 256;
2770
2771 loop {
2772 let n = read_once(reader, &mut buf)?;
2773 if n == 0 {
2774 break;
2775 }
2776 let wp = translate_squeeze_process(
2777 &mut buf,
2778 n,
2779 &table,
2780 &squeeze_set,
2781 range_info,
2782 range_const_info,
2783 &mut last_squeezed,
2784 );
2785 if wp > 0 {
2786 writer.write_all(&buf[..wp])?;
2787 }
2788 }
2789 Ok(())
2790}
2791
/// Translate `buf[..n]` in place (via whichever fast path was detected),
/// then squeeze runs of squeeze-set members, compacting in place.
/// Returns the compacted length.
///
/// `last_squeezed` is the previously emitted squeezable byte (256 = none);
/// it is carried across chunks so a run that spans a read boundary still
/// collapses to one byte.
#[inline]
fn translate_squeeze_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_set: &[u8; 32],
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    last_squeezed: &mut u16,
) -> usize {
    // Phase 1: translation, using the fastest applicable strategy.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }
    // Phase 2: squeeze. wp <= i always, so in-place writes never clobber
    // unread bytes.
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            // Emit the byte unless it is a squeeze-set member repeating the
            // previously emitted member; non-members reset the run state.
            macro_rules! squeeze_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if is_member(squeeze_set, b) {
                        if *last_squeezed != b as u16 {
                            *last_squeezed = b as u16;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    } else {
                        *last_squeezed = 256;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                };
            }
            squeeze_byte!(0);
            squeeze_byte!(1);
            squeeze_byte!(2);
            squeeze_byte!(3);
            squeeze_byte!(4);
            squeeze_byte!(5);
            squeeze_byte!(6);
            squeeze_byte!(7);
            i += 8;
        }
        // Scalar tail: same logic as the macro, written with continue.
        while i < n {
            let b = *ptr.add(i);
            if is_member(squeeze_set, b) {
                if *last_squeezed == b as u16 {
                    i += 1;
                    continue;
                }
                *last_squeezed = b as u16;
            } else {
                *last_squeezed = 256;
            }
            *ptr.add(wp) = b;
            wp += 1;
            i += 1;
        }
    }
    wp
}
2860
2861fn translate_squeeze_single_ch(
2865 table: &[u8; 256],
2866 squeeze_ch: u8,
2867 _squeeze_set: &[u8; 32],
2868 reader: &mut impl Read,
2869 writer: &mut impl Write,
2870) -> io::Result<()> {
2871 let range_info = detect_range_offset(table);
2872 let range_const_info = if range_info.is_none() {
2873 detect_range_to_constant(table)
2874 } else {
2875 None
2876 };
2877
2878 let pair = [squeeze_ch, squeeze_ch];
2879 let finder = memchr::memmem::Finder::new(&pair);
2880 let mut buf = alloc_uninit_vec(STREAM_BUF);
2881 let mut was_squeeze_char = false;
2882
2883 loop {
2884 let n = read_once(reader, &mut buf)?;
2885 if n == 0 {
2886 break;
2887 }
2888 let wp = translate_squeeze_single_process(
2889 &mut buf,
2890 n,
2891 table,
2892 squeeze_ch,
2893 &finder,
2894 range_info,
2895 range_const_info,
2896 &mut was_squeeze_char,
2897 );
2898 if wp > 0 {
2899 writer.write_all(&buf[..wp])?;
2900 }
2901 }
2902 Ok(())
2903}
2904
/// Translate `buf[..n]` in place, then squeeze runs of `squeeze_ch`,
/// compacting in place; returns the compacted length.
///
/// `finder` searches for the two-byte needle `[squeeze_ch, squeeze_ch]`:
/// a hit marks the start of a run to collapse. Everything up to and
/// including the first byte of the pair is kept verbatim; the remainder of
/// the run is skipped. `was_squeeze_char` carries "previous chunk ended in
/// squeeze_ch" across read boundaries.
#[inline]
fn translate_squeeze_single_process(
    buf: &mut [u8],
    n: usize,
    table: &[u8; 256],
    squeeze_ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    range_info: Option<(u8, u8, i8)>,
    range_const_info: Option<(u8, u8, u8)>,
    was_squeeze_char: &mut bool,
) -> usize {
    // Phase 1: translation via the fastest applicable strategy.
    if let Some((lo, hi, offset)) = range_info {
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
    } else if let Some((lo, hi, replacement)) = range_const_info {
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
    } else {
        translate_inplace(&mut buf[..n], table);
    }

    let mut i = 0;
    // A run continuing from the previous chunk: drop its leading bytes
    // (the run's first byte was already emitted last chunk).
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            // Entire chunk was the run's continuation: emit nothing and
            // keep the cross-chunk state set.
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Keep [i, seg_end): all bytes up to and including the run's
                // first character.
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        // SAFETY: wp <= i, ranges inside buf[..n]; ptr::copy
                        // tolerates overlap.
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                i = seg_end;
                // Skip the rest of the run.
                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
                    i += 1;
                }
                if i >= n {
                    // Run reaches the chunk end: it may continue next chunk.
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                // No further pairs: keep the remainder verbatim.
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                // A trailing single squeeze_ch still seeds the next chunk's
                // cross-boundary squeeze.
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
                break;
            }
        }
    }
    wp
}
2980
2981pub fn delete(
2982 delete_chars: &[u8],
2983 reader: &mut impl Read,
2984 writer: &mut impl Write,
2985) -> io::Result<()> {
2986 if delete_chars.len() == 1 {
2987 return delete_single_streaming(delete_chars[0], reader, writer);
2988 }
2989 if delete_chars.len() <= 3 {
2990 return delete_multi_streaming(delete_chars, reader, writer);
2991 }
2992
2993 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2997 return delete_range_streaming(lo, hi, reader, writer);
2998 }
2999
3000 let member = build_member_set(delete_chars);
3001 let mut buf = alloc_uninit_vec(STREAM_BUF);
3002 let mut outbuf = alloc_uninit_vec(STREAM_BUF);
3005
3006 loop {
3007 let n = read_once(reader, &mut buf)?;
3008 if n == 0 {
3009 break;
3010 }
3011 let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
3012 if wp > 0 {
3013 writer.write_all(&outbuf[..wp])?;
3014 }
3015 }
3016 Ok(())
3017}
3018
3019#[inline]
3020fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3021 #[cfg(target_arch = "x86_64")]
3022 {
3023 if get_simd_level() >= 3 {
3024 return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3025 }
3026 }
3027 delete_bitset_scalar(src, dst, member)
3028}
3029
3030#[inline]
3032fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3033 let n = src.len();
3034 let mut wp = 0;
3035 unsafe {
3036 let sp = src.as_ptr();
3037 let dp = dst.as_mut_ptr();
3038 let mut i = 0;
3039 while i + 8 <= n {
3040 let b0 = *sp.add(i);
3041 let b1 = *sp.add(i + 1);
3042 let b2 = *sp.add(i + 2);
3043 let b3 = *sp.add(i + 3);
3044 let b4 = *sp.add(i + 4);
3045 let b5 = *sp.add(i + 5);
3046 let b6 = *sp.add(i + 6);
3047 let b7 = *sp.add(i + 7);
3048 *dp.add(wp) = b0;
3049 wp += !is_member(member, b0) as usize;
3050 *dp.add(wp) = b1;
3051 wp += !is_member(member, b1) as usize;
3052 *dp.add(wp) = b2;
3053 wp += !is_member(member, b2) as usize;
3054 *dp.add(wp) = b3;
3055 wp += !is_member(member, b3) as usize;
3056 *dp.add(wp) = b4;
3057 wp += !is_member(member, b4) as usize;
3058 *dp.add(wp) = b5;
3059 wp += !is_member(member, b5) as usize;
3060 *dp.add(wp) = b6;
3061 wp += !is_member(member, b6) as usize;
3062 *dp.add(wp) = b7;
3063 wp += !is_member(member, b7) as usize;
3064 i += 8;
3065 }
3066 while i < n {
3067 let b = *sp.add(i);
3068 *dp.add(wp) = b;
3069 wp += !is_member(member, b) as usize;
3070 i += 1;
3071 }
3072 }
3073 wp
3074}
3075
/// AVX2 bitset delete from `src` into `dst` (separate buffers); returns
/// bytes written.
///
/// Membership is tested 32 bytes at a time against the 256-bit set: for a
/// byte `b`, the set byte is `member[b >> 3]` and the bit is `1 << (b & 7)`.
/// Both lookups are done with PSHUFB — the set is split into its low and
/// high 16 bytes, each broadcast to both lanes, indexed by `byte_idx & 0xF`,
/// and the halves are selected per byte with BLENDV.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let n = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);

        let mask7 = _mm256_set1_epi8(7);
        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);

        // PSHUFB table mapping bit position 0..7 -> single-bit mask
        // (duplicated for both 128-bit lanes).
        let bit_table = _mm256_setr_epi8(
            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
            0, 0, 0, 0, 0, 0, 0, 0,
        );

        while ri + 32 <= n {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);

            // byte_idx = b >> 3 (0..31); srli_epi16 bleeds bits across byte
            // boundaries, the & 0x1F masks them back off.
            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
            let bit_pos = _mm256_and_si256(input, mask7);
            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);

            // Fetch member[byte_idx] per byte: PSHUFB each 16-byte half of
            // the set, then pick a half per byte. slli_epi16(byte_idx, 3)
            // moves bit 4 of byte_idx (the "high half" bit) into each
            // byte's sign bit, which is all BLENDV consults.
            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
            let lo_mask = _mm256_set1_epi8(0x0F);
            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
            let use_hi = _mm256_slli_epi16(byte_idx, 3); let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);

            // Bit clear -> not a member -> keep.
            let test = _mm256_and_si256(member_byte, bit_mask);
            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;

            if keep_mask == 0xFFFFFFFF {
                // src and dst are distinct buffers, so nonoverlapping copy
                // is sound here.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Compact each 8-byte lane; keep_mask == 0 drops the block.
                let m0 = keep_mask as u8;
                let m1 = (keep_mask >> 8) as u8;
                let m2 = (keep_mask >> 16) as u8;
                let m3 = (keep_mask >> 24) as u8;

                if m0 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
                } else if m0 != 0 {
                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
                }
                let c0 = m0.count_ones() as usize;

                if m1 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
                } else if m1 != 0 {
                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
                }
                let c1 = m1.count_ones() as usize;

                if m2 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
                } else if m2 != 0 {
                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
                }
                let c2 = m2.count_ones() as usize;

                if m3 == 0xFF {
                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
                } else if m3 != 0 {
                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
                }
                let c3 = m3.count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while ri < n {
            let b = *sp.add(ri);
            *dp.add(wp) = b;
            wp += !is_member(member, b) as usize;
            ri += 1;
        }

        wp
    }
}
3194
3195fn delete_single_streaming(
3196 ch: u8,
3197 reader: &mut impl Read,
3198 writer: &mut impl Write,
3199) -> io::Result<()> {
3200 let mut buf = alloc_uninit_vec(STREAM_BUF);
3201 loop {
3202 let n = read_once(reader, &mut buf)?;
3203 if n == 0 {
3204 break;
3205 }
3206 let wp = delete_single_inplace(&mut buf, n, ch);
3207 if wp > 0 {
3208 writer.write_all(&buf[..wp])?;
3209 }
3210 }
3211 Ok(())
3212}
3213
3214#[inline]
3216fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3217 let mut wp = 0;
3218 let mut i = 0;
3219 while i < n {
3220 match memchr::memchr(ch, &buf[i..n]) {
3221 Some(offset) => {
3222 if offset > 0 {
3223 if wp != i {
3224 unsafe {
3225 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3226 }
3227 }
3228 wp += offset;
3229 }
3230 i += offset + 1;
3231 }
3232 None => {
3233 let run_len = n - i;
3234 if run_len > 0 {
3235 if wp != i {
3236 unsafe {
3237 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3238 }
3239 }
3240 wp += run_len;
3241 }
3242 break;
3243 }
3244 }
3245 }
3246 wp
3247}
3248
3249fn delete_multi_streaming(
3250 chars: &[u8],
3251 reader: &mut impl Read,
3252 writer: &mut impl Write,
3253) -> io::Result<()> {
3254 let mut buf = alloc_uninit_vec(STREAM_BUF);
3255 loop {
3256 let n = read_once(reader, &mut buf)?;
3257 if n == 0 {
3258 break;
3259 }
3260 let wp = delete_multi_inplace(&mut buf, n, chars);
3261 if wp > 0 {
3262 writer.write_all(&buf[..wp])?;
3263 }
3264 }
3265 Ok(())
3266}
3267
3268#[inline]
3270fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3271 let mut wp = 0;
3272 let mut i = 0;
3273 while i < n {
3274 let found = if chars.len() == 2 {
3275 memchr::memchr2(chars[0], chars[1], &buf[i..n])
3276 } else {
3277 memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3278 };
3279 match found {
3280 Some(offset) => {
3281 if offset > 0 {
3282 if wp != i {
3283 unsafe {
3284 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3285 }
3286 }
3287 wp += offset;
3288 }
3289 i += offset + 1;
3290 }
3291 None => {
3292 let run_len = n - i;
3293 if run_len > 0 {
3294 if wp != i {
3295 unsafe {
3296 std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3297 }
3298 }
3299 wp += run_len;
3300 }
3301 break;
3302 }
3303 }
3304 }
3305 wp
3306}
3307
3308pub fn delete_squeeze(
3309 delete_chars: &[u8],
3310 squeeze_chars: &[u8],
3311 reader: &mut impl Read,
3312 writer: &mut impl Write,
3313) -> io::Result<()> {
3314 let delete_set = build_member_set(delete_chars);
3315 let squeeze_set = build_member_set(squeeze_chars);
3316 let mut buf = alloc_uninit_vec(STREAM_BUF);
3317 let mut last_squeezed: u16 = 256;
3318
3319 loop {
3320 let n = read_once(reader, &mut buf)?;
3321 if n == 0 {
3322 break;
3323 }
3324 let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3325 if wp > 0 {
3326 writer.write_all(&buf[..wp])?;
3327 }
3328 }
3329 Ok(())
3330}
3331
/// Combined delete + squeeze over `buf[..n]`, in place: drop members of
/// `delete_set`, then collapse runs of `squeeze_set` members. Returns the
/// compacted length.
///
/// `last_squeezed` is the previously emitted squeezable byte (256 = none)
/// and is carried across chunks. wp <= i always, so in-place writes never
/// clobber unread input.
#[inline]
fn delete_squeeze_inplace(
    buf: &mut [u8],
    n: usize,
    delete_set: &[u8; 32],
    squeeze_set: &[u8; 32],
    last_squeezed: &mut u16,
) -> usize {
    let mut wp = 0;
    unsafe {
        let ptr = buf.as_mut_ptr();
        let mut i = 0;
        while i + 8 <= n {
            // Deleted bytes are dropped outright; surviving squeeze-set
            // members are emitted only when they break the current run;
            // other survivors reset the run state.
            macro_rules! process_byte {
                ($off:expr) => {
                    let b = *ptr.add(i + $off);
                    if !is_member(delete_set, b) {
                        if is_member(squeeze_set, b) {
                            if *last_squeezed != b as u16 {
                                *last_squeezed = b as u16;
                                *ptr.add(wp) = b;
                                wp += 1;
                            }
                        } else {
                            *last_squeezed = 256;
                            *ptr.add(wp) = b;
                            wp += 1;
                        }
                    }
                };
            }
            process_byte!(0);
            process_byte!(1);
            process_byte!(2);
            process_byte!(3);
            process_byte!(4);
            process_byte!(5);
            process_byte!(6);
            process_byte!(7);
            i += 8;
        }
        // Scalar tail: identical logic to the unrolled macro.
        while i < n {
            let b = *ptr.add(i);
            if !is_member(delete_set, b) {
                if is_member(squeeze_set, b) {
                    if *last_squeezed != b as u16 {
                        *last_squeezed = b as u16;
                        *ptr.add(wp) = b;
                        wp += 1;
                    }
                } else {
                    *last_squeezed = 256;
                    *ptr.add(wp) = b;
                    wp += 1;
                }
            }
            i += 1;
        }
    }
    wp
}
3393
3394pub fn squeeze(
3395 squeeze_chars: &[u8],
3396 reader: &mut impl Read,
3397 writer: &mut impl Write,
3398) -> io::Result<()> {
3399 if squeeze_chars.len() == 1 {
3400 return squeeze_single_stream(squeeze_chars[0], reader, writer);
3401 }
3402
3403 if squeeze_chars.len() <= 3 {
3406 return squeeze_multi_stream(squeeze_chars, reader, writer);
3407 }
3408
3409 let member = build_member_set(squeeze_chars);
3410 let mut buf = alloc_uninit_vec(STREAM_BUF);
3411 let mut last_squeezed: u16 = 256;
3412
3413 loop {
3414 let n = read_once(reader, &mut buf)?;
3415 if n == 0 {
3416 break;
3417 }
3418 let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3419 if wp > 0 {
3420 writer.write_all(&buf[..wp])?;
3421 }
3422 }
3423 Ok(())
3424}
3425
3426#[inline]
3427fn squeeze_inplace_bitset(
3428 buf: &mut [u8],
3429 n: usize,
3430 member: &[u8; 32],
3431 last_squeezed: &mut u16,
3432) -> usize {
3433 let mut wp = 0;
3434 unsafe {
3435 let ptr = buf.as_mut_ptr();
3436 for i in 0..n {
3437 let b = *ptr.add(i);
3438 if is_member(member, b) {
3439 if *last_squeezed == b as u16 {
3440 continue;
3441 }
3442 *last_squeezed = b as u16;
3443 } else {
3444 *last_squeezed = 256;
3445 }
3446 *ptr.add(wp) = b;
3447 wp += 1;
3448 }
3449 }
3450 wp
3451}
3452
3453fn squeeze_multi_stream(
3457 chars: &[u8],
3458 reader: &mut impl Read,
3459 writer: &mut impl Write,
3460) -> io::Result<()> {
3461 let c0 = chars[0];
3462 let c1 = chars[1];
3463 let c2 = if chars.len() >= 3 {
3464 Some(chars[2])
3465 } else {
3466 None
3467 };
3468
3469 let mut buf = alloc_uninit_vec(STREAM_BUF);
3470 let mut last_squeezed: u16 = 256;
3471
3472 loop {
3473 let n = read_once(reader, &mut buf)?;
3474 if n == 0 {
3475 break;
3476 }
3477 let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3478 if wp > 0 {
3479 writer.write_all(&buf[..wp])?;
3480 }
3481 }
3482 Ok(())
3483}
3484
/// Squeeze runs of `c0`/`c1`/`c2` in `buf[..n]` in place using memchr2/3 to
/// jump between member bytes; returns the compacted length.
///
/// `last_squeezed` carries the previously emitted member byte (256 = none)
/// across chunks so a run spanning a read boundary still collapses.
/// wp <= cursor throughout, so forward in-place copies are safe.
#[inline]
fn squeeze_multi_compact(
    buf: &mut [u8],
    n: usize,
    c0: u8,
    c1: u8,
    c2: Option<u8>,
    last_squeezed: &mut u16,
) -> usize {
    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;
    let mut cursor = 0usize;

    while cursor < n {
        let found = if let Some(c) = c2 {
            memchr::memchr3(c0, c1, c, &buf[cursor..n])
        } else {
            memchr::memchr2(c0, c1, &buf[cursor..n])
        };
        match found {
            Some(offset) => {
                let pos = cursor + offset;
                let b = unsafe { *ptr.add(pos) };

                // Keep the non-member gap verbatim; any gap byte breaks the
                // current run, so reset the squeeze state.
                let gap = pos - cursor;
                if gap > 0 {
                    if wp != cursor {
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                    *last_squeezed = 256;
                }

                // Emit the member byte only if it does not continue a run
                // (possibly one started in the previous chunk).
                if *last_squeezed != b as u16 {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                    *last_squeezed = b as u16;
                }

                // Skip the rest of this byte's run.
                cursor = pos + 1;
                while cursor < n && unsafe { *ptr.add(cursor) } == b {
                    cursor += 1;
                }
            }
            None => {
                // No member bytes remain: keep the tail verbatim.
                let rem = n - cursor;
                if rem > 0 {
                    if wp != cursor {
                        unsafe {
                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                    *last_squeezed = 256;
                }
                break;
            }
        }
    }
    wp
}
3549
3550fn squeeze_single_stream(
3551 ch: u8,
3552 reader: &mut impl Read,
3553 writer: &mut impl Write,
3554) -> io::Result<()> {
3555 let mut buf = alloc_uninit_vec(STREAM_BUF);
3556 let mut was_squeeze_char = false;
3557
3558 #[cfg(target_arch = "x86_64")]
3560 if get_simd_level() >= 3 {
3561 loop {
3562 let n = read_once(reader, &mut buf)?;
3563 if n == 0 {
3564 break;
3565 }
3566 let wp = unsafe { squeeze_single_avx2_inplace(&mut buf, n, ch, &mut was_squeeze_char) };
3567 if wp > 0 {
3568 writer.write_all(&buf[..wp])?;
3569 }
3570 }
3571 return Ok(());
3572 }
3573
3574 let pair = [ch, ch];
3576 let finder = memchr::memmem::Finder::new(&pair);
3577 loop {
3578 let n = read_once(reader, &mut buf)?;
3579 if n == 0 {
3580 break;
3581 }
3582 let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3583 if wp > 0 {
3584 writer.write_all(&buf[..wp])?;
3585 }
3586 }
3587 Ok(())
3588}
3589
/// Squeeze runs of `ch` in `buf[..n]` in place; returns compacted length.
///
/// `finder` searches for the pair `[ch, ch]`: a hit marks a run to
/// collapse. Everything up to and including the pair's first byte is kept,
/// the rest of the run is skipped. `was_squeeze_char` carries "previous
/// chunk ended in `ch`" across read boundaries.
#[inline]
fn squeeze_single_compact(
    buf: &mut [u8],
    n: usize,
    ch: u8,
    finder: &memchr::memmem::Finder<'_>,
    was_squeeze_char: &mut bool,
) -> usize {
    let mut i = 0;

    // A run continuing from the previous chunk: its first byte was already
    // emitted, so drop the leading repeats.
    if *was_squeeze_char {
        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
            i += 1;
        }
        *was_squeeze_char = false;
        if i >= n {
            // Whole chunk was the run's continuation: emit nothing, keep
            // the cross-chunk state set.
            *was_squeeze_char = true;
            return 0;
        }
    }

    let ptr = buf.as_mut_ptr();
    let mut wp = 0usize;

    loop {
        match finder.find(&buf[i..n]) {
            Some(offset) => {
                // Keep [i, seg_end): bytes up to and including the run's
                // first character.
                let seg_end = i + offset + 1;
                let gap = seg_end - i;
                if gap > 0 {
                    if wp != i {
                        // SAFETY: wp <= i, both ranges in buf[..n];
                        // ptr::copy tolerates overlap.
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
                        }
                    }
                    wp += gap;
                }
                i = seg_end;
                // Skip the rest of the run.
                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
                    i += 1;
                }
                if i >= n {
                    // Run reaches the chunk end: it may continue next chunk.
                    *was_squeeze_char = true;
                    break;
                }
            }
            None => {
                // No further pairs: keep the remainder verbatim.
                let rem = n - i;
                if rem > 0 {
                    if wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
                        }
                    }
                    wp += rem;
                }
                // A trailing lone `ch` still seeds the next chunk's
                // cross-boundary squeeze.
                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
                break;
            }
        }
    }
    wp
}
3655
3656#[cfg(target_arch = "x86_64")]
3667#[target_feature(enable = "avx2")]
3668unsafe fn squeeze_single_avx2_inplace(
3669 buf: &mut [u8],
3670 n: usize,
3671 ch: u8,
3672 was_squeeze_char: &mut bool,
3673) -> usize {
3674 use std::arch::x86_64::*;
3675
3676 unsafe {
3677 let ch_v = _mm256_set1_epi8(ch as i8);
3678 let ptr = buf.as_mut_ptr();
3679 let mut ri = 0;
3680 let mut wp = 0;
3681 let mut carry: u32 = if *was_squeeze_char { 1 } else { 0 };
3682
3683 while ri + 32 <= n {
3684 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
3685 let cmp = _mm256_cmpeq_epi8(input, ch_v);
3686 let sq_mask = _mm256_movemask_epi8(cmp) as u32;
3687
3688 let prev_sq_mask = (sq_mask << 1) | carry;
3691
3692 let remove_mask = sq_mask & prev_sq_mask;
3694
3695 carry = (sq_mask >> 31) & 1;
3697
3698 if remove_mask == 0 {
3699 if wp != ri {
3701 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
3702 }
3703 wp += 32;
3704 } else if remove_mask != 0xFFFFFFFF {
3705 let keep_mask = !remove_mask;
3707 let m0 = keep_mask as u8;
3708 let m1 = (keep_mask >> 8) as u8;
3709 let m2 = (keep_mask >> 16) as u8;
3710 let m3 = (keep_mask >> 24) as u8;
3711
3712 if m0 == 0xFF {
3713 std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3714 } else if m0 != 0 {
3715 compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3716 }
3717 let c0 = m0.count_ones() as usize;
3718
3719 if m1 == 0xFF {
3720 std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3721 } else if m1 != 0 {
3722 compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3723 }
3724 let c1 = m1.count_ones() as usize;
3725
3726 if m2 == 0xFF {
3727 std::ptr::copy_nonoverlapping(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
3728 } else if m2 != 0 {
3729 compact_8bytes_simd(ptr.add(ri + 16), ptr.add(wp + c0 + c1), m2);
3730 }
3731 let c2 = m2.count_ones() as usize;
3732
3733 if m3 == 0xFF {
3734 std::ptr::copy_nonoverlapping(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
3735 } else if m3 != 0 {
3736 compact_8bytes_simd(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), m3);
3737 }
3738 let c3 = m3.count_ones() as usize;
3739 wp += c0 + c1 + c2 + c3;
3740 }
3741 ri += 32;
3744 }
3745
3746 if ri + 16 <= n {
3748 let ch_v128 = _mm_set1_epi8(ch as i8);
3749 let input = _mm_loadu_si128(ptr.add(ri) as *const _);
3750 let cmp = _mm_cmpeq_epi8(input, ch_v128);
3751 let sq_mask = _mm_movemask_epi8(cmp) as u32 & 0xFFFF;
3752 let prev_sq_mask = (sq_mask << 1) | carry;
3753 let remove_mask = sq_mask & prev_sq_mask;
3754 carry = (sq_mask >> 15) & 1;
3755
3756 if remove_mask == 0 {
3757 if wp != ri {
3758 std::ptr::copy(ptr.add(ri), ptr.add(wp), 16);
3759 }
3760 wp += 16;
3761 } else if remove_mask != 0xFFFF {
3762 let keep_mask = !remove_mask;
3763 let m0 = keep_mask as u8;
3764 let m1 = (keep_mask >> 8) as u8;
3765 if m0 == 0xFF {
3766 std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3767 } else if m0 != 0 {
3768 compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3769 }
3770 let c0 = m0.count_ones() as usize;
3771 if m1 == 0xFF {
3772 std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3773 } else if m1 != 0 {
3774 compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3775 }
3776 wp += c0 + m1.count_ones() as usize;
3777 }
3778 ri += 16;
3779 }
3780
3781 while ri < n {
3783 let b = *ptr.add(ri);
3784 if b == ch && carry != 0 {
3785 } else {
3787 *ptr.add(wp) = b;
3788 wp += 1;
3789 }
3790 carry = if b == ch { 1 } else { 0 };
3791 ri += 1;
3792 }
3793
3794 *was_squeeze_char = carry != 0;
3795 wp
3796 }
3797}
3798
3799pub fn translate_owned(
3807 set1: &[u8],
3808 set2: &[u8],
3809 data: &mut [u8],
3810 writer: &mut impl Write,
3811) -> io::Result<()> {
3812 let table = build_translate_table(set1, set2);
3813
3814 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3816 if is_identity {
3817 return writer.write_all(data);
3818 }
3819
3820 const OWNED_PARALLEL_MIN: usize = 64 * 1024 * 1024;
3824
3825 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3827 if data.len() >= OWNED_PARALLEL_MIN {
3828 let n_threads = rayon::current_num_threads().max(1);
3829 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3830 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3831 translate_range_simd_inplace(chunk, lo, hi, offset);
3832 });
3833 } else {
3834 translate_range_simd_inplace(data, lo, hi, offset);
3835 }
3836 return writer.write_all(data);
3837 }
3838
3839 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3841 if data.len() >= OWNED_PARALLEL_MIN {
3842 let n_threads = rayon::current_num_threads().max(1);
3843 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3844 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3845 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3846 });
3847 } else {
3848 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3849 }
3850 return writer.write_all(data);
3851 }
3852
3853 if data.len() >= OWNED_PARALLEL_MIN {
3855 let n_threads = rayon::current_num_threads().max(1);
3856 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3857 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3858 translate_inplace(chunk, &table);
3859 });
3860 } else {
3861 translate_inplace(data, &table);
3862 }
3863 writer.write_all(data)
3864}
3865
3866pub fn translate_mmap(
3880 set1: &[u8],
3881 set2: &[u8],
3882 data: &[u8],
3883 writer: &mut impl Write,
3884) -> io::Result<()> {
3885 let table = build_translate_table(set1, set2);
3886
3887 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3889 if is_identity {
3890 return writer.write_all(data);
3891 }
3892
3893 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3895 return translate_mmap_range(data, writer, lo, hi, offset);
3896 }
3897
3898 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3900 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3901 }
3902
3903 translate_mmap_table(data, writer, &table)
3905}
3906
/// Translate `data` through a range mapping (bytes in [lo, hi] are shifted —
/// semantics defined by `detect_range_offset`/`translate_range_simd`) and
/// write the result to `writer`.
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    // Large-but-bounded inputs: translate all chunks in parallel into one
    // output buffer, then issue a single write.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Same chunk_size on both sides keeps the i-th source chunk aligned
        // with the i-th destination chunk.
        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    // Otherwise stream through a fixed-size scratch buffer to bound memory.
    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}
3943
3944fn translate_mmap_range_to_constant(
3947 data: &[u8],
3948 writer: &mut impl Write,
3949 lo: u8,
3950 hi: u8,
3951 replacement: u8,
3952) -> io::Result<()> {
3953 if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
3956 let mut buf = alloc_uninit_vec(data.len());
3957 let n_threads = rayon::current_num_threads().max(1);
3958 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3959
3960 data.par_chunks(chunk_size)
3962 .zip(buf.par_chunks_mut(chunk_size))
3963 .for_each(|(src_chunk, dst_chunk)| {
3964 dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3965 translate_range_to_constant_simd_inplace(
3966 &mut dst_chunk[..src_chunk.len()],
3967 lo,
3968 hi,
3969 replacement,
3970 );
3971 });
3972
3973 return writer.write_all(&buf);
3974 }
3975
3976 const CHUNK: usize = 256 * 1024;
3979 let buf_size = data.len().min(CHUNK);
3980 let mut buf = alloc_uninit_vec(buf_size);
3981 for chunk in data.chunks(CHUNK) {
3982 buf[..chunk.len()].copy_from_slice(chunk);
3983 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3984 writer.write_all(&buf[..chunk.len()])?;
3985 }
3986 Ok(())
3987}
3988
/// General table-lookup translation of read-only `data`, written to `writer`.
fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    // Large-but-bounded inputs: one big output buffer, chunks translated in
    // parallel, single write at the end.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Matching chunk_size keeps src/dst chunk pairs aligned.
        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    // Streaming fallback with a bounded scratch buffer.
    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}
4018
/// Translate a writable mapping via set1->set2 in place, then write it out.
///
/// Mutating `data` directly avoids allocating a second buffer; callers must
/// hold a mutable view of the data (e.g. a private mapping).
pub fn translate_mmap_inplace(
    set1: &[u8],
    set2: &[u8],
    data: &mut [u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // Identity table: translation is a no-op, just copy through.
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    // Fast path: a contiguous byte range shifted by a constant offset.
    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size)
                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
        } else {
            translate_range_simd_inplace(data, lo, hi, offset);
        }
        return writer.write_all(data);
    }

    // Fast path: a contiguous byte range collapsed onto one replacement byte.
    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
            });
        } else {
            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
        }
        return writer.write_all(data);
    }

    // General case: per-byte table lookup, parallelized for huge inputs.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);
        data.par_chunks_mut(chunk_size)
            .for_each(|chunk| translate_inplace(chunk, &table));
    } else {
        translate_inplace(data, &table);
    }
    writer.write_all(data)
}
4082
/// Translate read-only `data` into a separate buffer and write it, choosing
/// the best kernel (range+offset, range->constant, or table lookup) once up
/// front so the per-chunk loop stays branch-cheap.
fn translate_to_separate_buf(
    data: &[u8],
    table: &[u8; 256],
    writer: &mut impl Write,
) -> io::Result<()> {
    // Detect special table shapes once; const_info is only probed when the
    // range+offset shape did not match (they are mutually exclusive paths).
    let range_info = detect_range_offset(table);
    let const_info = if range_info.is_none() {
        detect_range_to_constant(table)
    } else {
        None
    };

    // Large-but-bounded inputs: parallel translation into one output buffer,
    // then a single write.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let mut out_buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
                });
        } else if let Some((lo, hi, replacement)) = const_info {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_range_to_constant_simd(
                        src,
                        &mut dst[..src.len()],
                        lo,
                        hi,
                        replacement,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(out_buf.par_chunks_mut(chunk_size))
                .for_each(|(src, dst)| {
                    translate_to(src, &mut dst[..src.len()], table);
                });
        }
        return writer.write_all(&out_buf);
    }

    // Streaming fallback: bounded scratch buffer, chunked writes.
    const CHUNK: usize = 256 * 1024;
    let buf_size = data.len().min(CHUNK);
    let mut out_buf = alloc_uninit_vec(buf_size);
    for chunk in data.chunks(CHUNK) {
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd(chunk, &mut out_buf[..chunk.len()], lo, hi, offset);
        } else if let Some((lo, hi, replacement)) = const_info {
            translate_range_to_constant_simd(
                chunk,
                &mut out_buf[..chunk.len()],
                lo,
                hi,
                replacement,
            );
        } else {
            translate_to(chunk, &mut out_buf[..chunk.len()], table);
        }
        writer.write_all(&out_buf[..chunk.len()])?;
    }
    Ok(())
}
4160
4161pub fn translate_mmap_readonly(
4165 set1: &[u8],
4166 set2: &[u8],
4167 data: &[u8],
4168 writer: &mut impl Write,
4169) -> io::Result<()> {
4170 let table = build_translate_table(set1, set2);
4171
4172 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4174 if is_identity {
4175 return writer.write_all(data);
4176 }
4177
4178 translate_to_separate_buf(data, &table, writer)
4179}
4180
/// Translate via set1->set2, then squeeze repeats (tr with both translation
/// and -s): adjacent duplicates of any byte in the squeeze set collapse to a
/// single occurrence.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    // Squeezing applies to post-translation bytes, hence membership in set2.
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        // Phase 1: translate in parallel into one buffer.
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Phase 2: squeeze sequentially, compacting the buffer in place.
        // 256 is a sentinel outside u8 range, so nothing matches before the
        // first squeezable byte is seen.
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            // SAFETY: wp <= i < len at every store, so all reads and writes
            // stay in bounds and only already-consumed bytes are overwritten.
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        // Repeat of the previous squeezed byte: drop it.
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_ALLOC_LIMIT {
        // Serial path: translate into a buffer, then squeeze it in place.
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, &table);
        let mut last_squeezed: u16 = 256;
        let mut wp = 0;
        unsafe {
            // SAFETY: wp <= i < data.len() == buf.len() throughout.
            let ptr = buf.as_mut_ptr();
            for i in 0..data.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        return writer.write_all(&buf[..wp]);
    }

    // Streaming path for inputs too large for a single allocation.
    // `last_squeezed` persists across chunks so runs spanning a chunk
    // boundary are still squeezed correctly.
    const CHUNK: usize = 256 * 1024;
    let mut last_squeezed: u16 = 256;
    let mut buf = alloc_uninit_vec(CHUNK);
    for chunk in data.chunks(CHUNK) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        for i in 0..chunk.len() {
            let b = buf[i];
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // wp <= i, so compaction within buf is safe.
            buf[wp] = b;
            wp += 1;
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
4303
/// Delete every occurrence of `delete_chars` from `data` (tr -d), choosing a
/// strategy based on the set size and a sampled estimate of delete density.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // 1-3 distinct bytes: memchr-family scans beat the bitset loop.
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    // Contiguous byte range: dedicated SIMD range-compare path.
    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    // Estimate total deletions by sampling the first 1 KiB, to decide
    // between zero-copy vectored writes and buffer compaction.
    let sample_size = data.len().min(1024);
    let sample_deletes = data[..sample_size]
        .iter()
        .filter(|&&b| is_member(&member, b))
        .count();
    let estimated_deletes = if sample_size > 0 {
        data.len() * sample_deletes / sample_size
    } else {
        data.len()
    };

    // Few expected deletions => few IoSlice gaps => zero-copy wins.
    if estimated_deletes < MAX_IOV / 2 {
        return delete_bitset_zerocopy(data, &member, writer);
    }

    // Dense deletions on a large input: compact chunks in parallel into one
    // buffer, then write the kept prefix of each chunk with vectored I/O.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Chunk i's kept bytes live at [i * chunk_size, i * chunk_size + len);
        // empty chunks are skipped.
        let slices: Vec<std::io::IoSlice> = chunk_lens
            .iter()
            .enumerate()
            .filter(|&(_, &len)| len > 0)
            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Streaming fallback with a bounded scratch buffer.
    const COMPACT_BUF: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
    for chunk in data.chunks(COMPACT_BUF) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        if out_pos > 0 {
            writer.write_all(&outbuf[..out_pos])?;
        }
    }
    Ok(())
}
4380
/// Delete every byte in [lo, hi] from `data`, choosing between zero-copy
/// vectored writes (sparse deletions) and buffer compaction (dense).
fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    // Estimate deletion density from the first 1 KiB.
    let sample_size = data.len().min(1024);
    let sample_deletes = data[..sample_size]
        .iter()
        .filter(|&&b| b >= lo && b <= hi)
        .count();
    let estimated_deletes = if sample_size > 0 {
        data.len() * sample_deletes / sample_size
    } else {
        data.len()
    };
    // Sparse deletions: the gap count stays small, so zero-copy IoSlices win.
    if estimated_deletes < MAX_IOV / 2 {
        return delete_range_mmap_zerocopy(data, writer, lo, hi);
    }

    // Dense deletions on a large input: compact chunks in parallel, then
    // write each chunk's kept prefix with vectored I/O.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
            .collect();

        // Chunk i's kept bytes start at i * chunk_size in outbuf.
        let slices: Vec<std::io::IoSlice> = chunk_lens
            .iter()
            .enumerate()
            .filter(|&(_, &len)| len > 0)
            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Streaming fallback with a bounded scratch buffer.
    const CHUNK: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(CHUNK);
    for chunk in data.chunks(CHUNK) {
        let kept = delete_range_chunk(chunk, &mut outbuf[..chunk.len()], lo, hi);
        writer.write_all(&outbuf[..kept])?;
    }
    Ok(())
}
4440
/// Zero-copy range delete: dispatch to the widest SIMD implementation the
/// current CPU supports, falling back to the scalar version.
fn delete_range_mmap_zerocopy(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    #[cfg(target_arch = "x86_64")]
    {
        // get_simd_level: level 3 presumably means AVX2 and level 2 SSE2 —
        // confirm against its definition elsewhere in this file.
        if get_simd_level() >= 3 {
            // SAFETY: only called when runtime detection reports the feature.
            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
        }
        if get_simd_level() >= 2 {
            // SAFETY: only called when runtime detection reports the feature.
            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON is a baseline feature on aarch64.
        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
    }

    // Reached only when no SIMD path applies for this target.
    #[allow(unreachable_code)]
    delete_range_zerocopy_scalar(data, writer, lo, hi)
}
4470
4471fn delete_range_zerocopy_scalar(
4474 data: &[u8],
4475 writer: &mut impl Write,
4476 lo: u8,
4477 hi: u8,
4478) -> io::Result<()> {
4479 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4480 let len = data.len();
4481 let mut run_start: usize = 0;
4482 let mut i: usize = 0;
4483
4484 while i < len {
4485 let b = unsafe { *data.get_unchecked(i) };
4486 if b >= lo && b <= hi {
4487 if i > run_start {
4488 iov.push(std::io::IoSlice::new(&data[run_start..i]));
4489 if iov.len() >= MAX_IOV {
4490 write_ioslices(writer, &iov)?;
4491 iov.clear();
4492 }
4493 }
4494 run_start = i + 1;
4495 }
4496 i += 1;
4497 }
4498 if run_start < len {
4499 iov.push(std::io::IoSlice::new(&data[run_start..]));
4500 }
4501 if !iov.is_empty() {
4502 write_ioslices(writer, &iov)?;
4503 }
4504 Ok(())
4505}
4506
/// Zero-copy range delete using AVX2: scan 32 bytes per iteration, build a
/// bitmask of in-range (to-delete) lanes, and queue the kept gaps as
/// IoSlices into the original buffer.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_zerocopy_avx2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        // Signed-compare range trick: adding (0x80 - lo) maps [lo, hi] onto
        // [-128, -128 + range] as i8, so a byte is in range exactly when the
        // biased value is <= threshold under signed comparison.
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            // Lane is in [lo, hi] iff the greater-than compare produced 0.
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let del_mask = _mm256_movemask_epi8(in_range) as u32;

            if del_mask == 0 {
                // Nothing deleted here; the current kept run keeps growing.
                ri += 32;
                continue;
            }

            // Each set bit is a deleted byte: close the kept run before it.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1; // clear lowest set bit
            }

            ri += 32;
        }

        // Scalar tail for the final < 32 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        // Flush the trailing kept run and any queued slices.
        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4588
/// Zero-copy range delete using SSE2: same algorithm as the AVX2 variant but
/// 16 bytes per iteration.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_zerocopy_sse2(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::x86_64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        // Signed-compare range trick: adding (0x80 - lo) maps [lo, hi] onto
        // [-128, -128 + range] as i8, so in-range bytes are exactly those not
        // greater than the threshold under signed comparison.
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        while ri + 16 <= len {
            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            // Lane is in [lo, hi] iff the greater-than compare produced 0.
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;

            if del_mask == 0 {
                // Nothing deleted here; the current kept run keeps growing.
                ri += 16;
                continue;
            }

            // Each set bit is a deleted byte: close the kept run before it.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1; // clear lowest set bit
            }

            ri += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        // Flush the trailing kept run and any queued slices.
        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4665
/// Zero-copy range delete using NEON: 16 bytes per iteration, with a
/// movemask emulation (NEON has no direct equivalent of _mm_movemask_epi8).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn delete_range_zerocopy_neon(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
) -> io::Result<()> {
    use std::arch::aarch64::*;

    unsafe {
        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
        let len = data.len();
        let mut run_start: usize = 0;
        let mut ri: usize = 0;

        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        // Per-lane weights 1,2,4,...,128 repeated for each 8-byte half; used
        // below to collapse the compare results into a 16-bit mask.
        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());

        while ri + 16 <= len {
            let input = vld1q_u8(data.as_ptr().add(ri));
            // Unsigned compares: lane is all-ones iff lo <= byte <= hi.
            let ge_lo = vcgeq_u8(input, lo_v);
            let le_hi = vcleq_u8(input, hi_v);
            let in_range = vandq_u8(ge_lo, le_hi);

            // Movemask emulation: weight each lane by its bit position, then
            // pairwise-add down to one byte per 8-lane half.
            let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;

            if del_mask == 0 {
                // Nothing deleted here; the current kept run keeps growing.
                ri += 16;
                continue;
            }

            // Each set bit is a deleted byte: close the kept run before it.
            let mut m = del_mask;
            while m != 0 {
                let bit = m.trailing_zeros() as usize;
                let abs_pos = ri + bit;
                if abs_pos > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = abs_pos + 1;
                m &= m - 1; // clear lowest set bit
            }

            ri += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while ri < len {
            let b = *data.get_unchecked(ri);
            if b >= lo && b <= hi {
                if ri > run_start {
                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
                    if iov.len() >= MAX_IOV {
                        write_ioslices(writer, &iov)?;
                        iov.clear();
                    }
                }
                run_start = ri + 1;
            }
            ri += 1;
        }

        // Flush the trailing kept run and any queued slices.
        if run_start < len {
            iov.push(std::io::IoSlice::new(&data[run_start..]));
        }
        if !iov.is_empty() {
            write_ioslices(writer, &iov)?;
        }
        Ok(())
    }
}
4757
/// Copy `chunk` into `outbuf`, dropping every byte present in `member`.
/// Returns the number of bytes kept.
///
/// Branchless compaction: each byte is stored unconditionally and the write
/// cursor only advances for kept bytes, so deleted bytes are overwritten by
/// the next store. Assumes outbuf.len() >= chunk.len() (callers pass
/// same-sized chunks) — the writes below rely on it.
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    // Main loop unrolled 8x so the stores pipeline without branches.
    while i + 8 <= len {
        // SAFETY: i + 8 <= len bounds all eight reads; out_pos <= i < len and
        // outbuf is at least chunk-sized, bounding every store.
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    // Scalar remainder.
    while i < len {
        // SAFETY: i < len; out_pos <= i keeps the store in bounds.
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}
4808
4809fn delete_bitset_zerocopy(
4814 data: &[u8],
4815 member: &[u8; 32],
4816 writer: &mut impl Write,
4817) -> io::Result<()> {
4818 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4819 let len = data.len();
4820 let mut i = 0;
4821 let mut run_start: Option<usize> = None;
4822
4823 while i < len {
4824 let b = unsafe { *data.get_unchecked(i) };
4825 if is_member(member, b) {
4826 if let Some(rs) = run_start {
4828 iov.push(std::io::IoSlice::new(&data[rs..i]));
4829 run_start = None;
4830 if iov.len() >= MAX_IOV {
4831 write_ioslices(writer, &iov)?;
4832 iov.clear();
4833 }
4834 }
4835 } else {
4836 if run_start.is_none() {
4838 run_start = Some(i);
4839 }
4840 }
4841 i += 1;
4842 }
4843 if let Some(rs) = run_start {
4845 iov.push(std::io::IoSlice::new(&data[rs..]));
4846 }
4847 if !iov.is_empty() {
4848 write_ioslices(writer, &iov)?;
4849 }
4850 Ok(())
4851}
4852
4853fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4854 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4858 let mut last = 0;
4859 for pos in memchr::memchr_iter(ch, data) {
4860 if pos > last {
4861 iov.push(std::io::IoSlice::new(&data[last..pos]));
4862 if iov.len() >= MAX_IOV {
4863 write_ioslices(writer, &iov)?;
4864 iov.clear();
4865 }
4866 }
4867 last = pos + 1;
4868 }
4869 if last < data.len() {
4870 iov.push(std::io::IoSlice::new(&data[last..]));
4871 }
4872 if !iov.is_empty() {
4873 write_ioslices(writer, &iov)?;
4874 }
4875 Ok(())
4876}
4877
/// Zero-copy delete of 2 or 3 byte values using the memchr2/memchr3 scanners;
/// the gaps between hits are queued as IoSlices into the original buffer.
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    // Missing entries are padded with 0 but never scanned for (is_three gates
    // which iterator runs, and len >= 2 is guaranteed by the caller's dispatch).
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;

    // Shared per-hit handling for both iterator arms; a macro (not a closure)
    // so `?` propagates from the enclosing function.
    macro_rules! process_pos {
        ($pos:expr) => {
            if $pos > last {
                iov.push(std::io::IoSlice::new(&data[last..$pos]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            last = $pos + 1;
        };
    }

    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            process_pos!(pos);
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            process_pos!(pos);
        }
    }
    // Trailing kept bytes, then drain anything still queued.
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
4918
/// Combined tr -d/-s: drop bytes in `delete_chars`, then squeeze adjacent
/// repeats of bytes in `squeeze_chars` down to one occurrence.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    // Single-allocation path: one pass over the input, one write.
    if data.len() <= SINGLE_ALLOC_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        // 256 is a sentinel outside u8 range: matches nothing initially.
        let mut last_squeezed: u16 = 256;
        let mut out_pos = 0;

        for &b in data.iter() {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    // Repeat of the previous squeezed byte: drop it.
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: out_pos only advances for kept bytes, so it is always
            // <= the number of input bytes consumed <= outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        return writer.write_all(&outbuf[..out_pos]);
    }

    // Streaming path for very large inputs. `last_squeezed` persists across
    // chunks so runs spanning a boundary are still squeezed.
    const CHUNK: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(CHUNK);
    let mut last_squeezed: u16 = 256;
    for chunk in data.chunks(CHUNK) {
        let mut out_pos = 0;
        for &b in chunk.iter() {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            outbuf[out_pos] = b;
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
4983
/// Squeeze runs of bytes in `squeeze_chars` down to a single occurrence
/// (tr -s) over a read-only mapping.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    // Tiny sets get dedicated scanners.
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    // Parallel path: squeeze each chunk independently, then repair the chunk
    // boundaries before writing.
    if data.len() >= PARALLEL_THRESHOLD && data.len() <= SINGLE_ALLOC_LIMIT {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Boundary fix: if the last byte actually emitted before this chunk
        // is a squeezable byte equal to this chunk's leading byte, the
        // leading run is a continuation and must be dropped. Within a
        // squeezed chunk duplicates are never adjacent, so the skip is 0 or 1
        // bytes. The rev().find_map() picks the nearest non-empty previous
        // chunk, which holds the last byte emitted so far.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    // Serial single-allocation path: squeeze into one buffer, one write.
    if data.len() <= SINGLE_ALLOC_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;
        // 256 is a sentinel outside u8 range: matches nothing initially.
        let mut last_squeezed: u16 = 256;

        // SAFETY: i < len bounds every read; wp <= i bounds every write since
        // outbuf has the same length as data.
        unsafe {
            let inp = data.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    // Skip the rest of the run in a tight inner loop.
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        return writer.write_all(&outbuf[..wp]);
    }

    // Streaming path for very large inputs. `last_squeezed` persists across
    // chunks so runs spanning a boundary are still squeezed.
    const CHUNK: usize = 256 * 1024;
    let mut outbuf = alloc_uninit_vec(CHUNK);
    let mut last_squeezed: u16 = 256;
    for chunk in data.chunks(CHUNK) {
        let mut wp = 0;
        for &b in chunk.iter() {
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
            } else {
                last_squeezed = 256;
                outbuf[wp] = b;
                wp += 1;
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
5097
5098fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
5100 let len = chunk.len();
5101 let mut out = Vec::with_capacity(len);
5102 let mut last_squeezed: u16 = 256;
5103 let mut i = 0;
5104
5105 unsafe {
5106 out.set_len(len);
5107 let inp = chunk.as_ptr();
5108 let outp: *mut u8 = out.as_mut_ptr();
5109 let mut wp = 0;
5110
5111 while i < len {
5112 let b = *inp.add(i);
5113 if is_member(member, b) {
5114 if last_squeezed != b as u16 {
5115 *outp.add(wp) = b;
5116 wp += 1;
5117 last_squeezed = b as u16;
5118 }
5119 i += 1;
5120 while i < len && *inp.add(i) == b {
5121 i += 1;
5122 }
5123 } else {
5124 last_squeezed = 256;
5125 *outp.add(wp) = b;
5126 wp += 1;
5127 i += 1;
5128 }
5129 }
5130 out.set_len(wp);
5131 }
5132 out
5133}
5134
5135fn squeeze_multi_mmap<const N: usize>(
5136 chars: &[u8],
5137 data: &[u8],
5138 writer: &mut impl Write,
5139) -> io::Result<()> {
5140 if data.len() >= PARALLEL_THRESHOLD {
5142 let member = build_member_set(chars);
5143 let n_threads = rayon::current_num_threads().max(1);
5144 let chunk_size = (data.len() / n_threads).max(32 * 1024);
5145
5146 let results: Vec<Vec<u8>> = data
5147 .par_chunks(chunk_size)
5148 .map(|chunk| squeeze_chunk_bitset(chunk, &member))
5149 .collect();
5150
5151 let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
5153 for (idx, result) in results.iter().enumerate() {
5154 if result.is_empty() {
5155 continue;
5156 }
5157 if idx > 0 {
5158 if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
5159 if is_member(&member, prev_last) {
5160 let skip = result.iter().take_while(|&&b| b == prev_last).count();
5161 if skip < result.len() {
5162 slices.push(std::io::IoSlice::new(&result[skip..]));
5163 }
5164 continue;
5165 }
5166 }
5167 }
5168 slices.push(std::io::IoSlice::new(result));
5169 }
5170 return write_ioslices(writer, &slices);
5171 }
5172
5173 let single = [chars[0]; 1]; let _ = single;
5179 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
5180 let mut cursor = 0;
5181 let mut last_squeezed: u16 = 256;
5182
5183 macro_rules! find_next {
5184 ($data:expr) => {
5185 if N == 2 {
5186 memchr::memchr2(chars[0], chars[1], $data)
5187 } else {
5188 memchr::memchr3(chars[0], chars[1], chars[2], $data)
5189 }
5190 };
5191 }
5192
5193 while cursor < data.len() {
5194 match find_next!(&data[cursor..]) {
5195 Some(offset) => {
5196 let pos = cursor + offset;
5197 let b = data[pos];
5198 if pos > cursor {
5200 iov.push(std::io::IoSlice::new(&data[cursor..pos]));
5201 last_squeezed = 256;
5202 }
5203 if last_squeezed != b as u16 {
5205 iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
5207 last_squeezed = b as u16;
5208 }
5209 let mut skip = pos + 1;
5211 while skip < data.len() && data[skip] == b {
5212 skip += 1;
5213 }
5214 cursor = skip;
5215 if iov.len() >= MAX_IOV {
5217 write_ioslices(writer, &iov)?;
5218 iov.clear();
5219 }
5220 }
5221 None => {
5222 if cursor < data.len() {
5223 iov.push(std::io::IoSlice::new(&data[cursor..]));
5224 }
5225 break;
5226 }
5227 }
5228 }
5229 if !iov.is_empty() {
5230 write_ioslices(writer, &iov)?;
5231 }
5232 Ok(())
5233}
5234
5235fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
5236 if data.is_empty() {
5237 return Ok(());
5238 }
5239
5240 let pair = [ch, ch];
5242 if memchr::memmem::find(data, &pair).is_none() {
5243 return writer.write_all(data);
5244 }
5245
5246 let finder = memchr::memmem::Finder::new(&pair);
5253 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
5254 let mut cursor = 0;
5255
5256 while cursor < data.len() {
5257 match finder.find(&data[cursor..]) {
5258 Some(offset) => {
5259 let pair_pos = cursor + offset;
5260 let seg_end = pair_pos + 1;
5262 if seg_end > cursor {
5263 iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
5264 }
5265 let mut skip = seg_end;
5267 while skip < data.len() && data[skip] == ch {
5268 skip += 1;
5269 }
5270 cursor = skip;
5271 if iov.len() >= MAX_IOV {
5273 write_ioslices(writer, &iov)?;
5274 iov.clear();
5275 }
5276 }
5277 None => {
5278 if cursor < data.len() {
5280 iov.push(std::io::IoSlice::new(&data[cursor..]));
5281 }
5282 break;
5283 }
5284 }
5285 }
5286
5287 if !iov.is_empty() {
5288 write_ioslices(writer, &iov)?;
5289 }
5290 Ok(())
5291}