1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5const MAX_IOV: usize = 1024;
8
9const STREAM_BUF: usize = 32 * 1024 * 1024;
17
18const PARALLEL_THRESHOLD: usize = 32 * 1024 * 1024;
24
25#[cfg(target_arch = "x86_64")]
31static COMPACT_LUT: [[u8; 8]; 256] = {
32 let mut lut = [[0u8; 8]; 256];
33 let mut mask: u16 = 0;
34 while mask < 256 {
35 let mut idx: usize = 0;
36 let mut bit: u8 = 0;
37 while bit < 8 {
38 if (mask >> bit) & 1 != 0 {
39 lut[mask as usize][idx] = bit;
40 idx += 1;
41 }
42 bit += 1;
43 }
44 mask += 1;
45 }
46 lut
47};
48
49#[inline]
52fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
53 if slices.is_empty() {
54 return Ok(());
55 }
56 for batch in slices.chunks(MAX_IOV) {
57 let total: usize = batch.iter().map(|s| s.len()).sum();
58 match writer.write_vectored(batch) {
59 Ok(n) if n >= total => continue,
60 Ok(mut written) => {
61 for slice in batch {
63 let slen = slice.len();
64 if written >= slen {
65 written -= slen;
66 continue;
67 }
68 if written > 0 {
69 writer.write_all(&slice[written..])?;
70 written = 0;
71 } else {
72 writer.write_all(slice)?;
73 }
74 }
75 }
76 Err(e) => return Err(e),
77 }
78 }
79 Ok(())
80}
81
82#[inline]
86#[allow(clippy::uninit_vec)]
87fn alloc_uninit_vec(len: usize) -> Vec<u8> {
88 let mut v = Vec::with_capacity(len);
89 unsafe {
91 v.set_len(len);
92 }
93 #[cfg(target_os = "linux")]
94 if len >= 2 * 1024 * 1024 {
95 unsafe {
96 libc::madvise(
97 v.as_mut_ptr() as *mut libc::c_void,
98 len,
99 libc::MADV_HUGEPAGE,
100 );
101 }
102 }
103 v
104}
105
106#[inline]
108fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
109 let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
110 let last = set2.last().copied();
111 for (i, &from) in set1.iter().enumerate() {
112 table[from as usize] = if i < set2.len() {
113 set2[i]
114 } else {
115 last.unwrap_or(from)
116 };
117 }
118 table
119}
120
121#[inline]
123fn build_member_set(chars: &[u8]) -> [u8; 32] {
124 let mut set = [0u8; 32];
125 for &ch in chars {
126 set[ch as usize >> 3] |= 1 << (ch & 7);
127 }
128 set
129}
130
131#[inline(always)]
132fn is_member(set: &[u8; 32], ch: u8) -> bool {
133 unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
134}
135
136#[cfg(target_arch = "x86_64")]
139static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
140
141#[cfg(target_arch = "x86_64")]
142#[inline(always)]
143fn get_simd_level() -> u8 {
144 let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
145 if level != 0 {
146 return level;
147 }
148 let detected = if is_x86_feature_detected!("avx2") {
149 3
150 } else if is_x86_feature_detected!("ssse3") {
151 2
152 } else {
153 1
154 };
155 SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
156 detected
157}
158
159#[cfg(target_arch = "x86_64")]
161#[inline]
162fn count_non_identity(table: &[u8; 256]) -> usize {
163 table
164 .iter()
165 .enumerate()
166 .filter(|&(i, &v)| v != i as u8)
167 .count()
168}
169
170#[inline(always)]
176fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
177 #[cfg(target_arch = "x86_64")]
178 {
179 let level = get_simd_level();
180 if level >= 3 {
181 let non_id = count_non_identity(table);
186 if non_id > 0 && non_id <= 16 {
187 unsafe { translate_inplace_avx2_sparse(data, table) };
188 return;
189 }
190 unsafe { translate_inplace_avx2_table(data, table) };
191 return;
192 }
193 if level >= 2 {
194 unsafe { translate_inplace_ssse3_table(data, table) };
195 return;
196 }
197 }
198 translate_inplace_scalar(data, table);
199}
200
201#[cfg(target_arch = "x86_64")]
207#[target_feature(enable = "avx2")]
208unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
209 use std::arch::x86_64::*;
210
211 unsafe {
212 let len = data.len();
213 let ptr = data.as_mut_ptr();
214
215 let mut lut = [_mm256_setzero_si256(); 16];
217 for h in 0u8..16 {
218 let base = (h as usize) * 16;
219 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
220 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
221 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
222 }
223
224 let lo_mask = _mm256_set1_epi8(0x0F);
225
226 let mut i = 0;
227 while i + 32 <= len {
228 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
229 let lo_nibble = _mm256_and_si256(input, lo_mask);
230 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
231
232 let mut result = _mm256_setzero_si256();
233 macro_rules! do_nibble {
234 ($h:expr) => {
235 let h_val = _mm256_set1_epi8($h as i8);
236 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
237 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
238 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
239 };
240 }
241 do_nibble!(0);
242 do_nibble!(1);
243 do_nibble!(2);
244 do_nibble!(3);
245 do_nibble!(4);
246 do_nibble!(5);
247 do_nibble!(6);
248 do_nibble!(7);
249 do_nibble!(8);
250 do_nibble!(9);
251 do_nibble!(10);
252 do_nibble!(11);
253 do_nibble!(12);
254 do_nibble!(13);
255 do_nibble!(14);
256 do_nibble!(15);
257
258 let diff = _mm256_xor_si256(input, result);
260 if _mm256_testz_si256(diff, diff) == 0 {
261 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
262 }
263 i += 32;
264 }
265
266 while i < len {
268 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
269 i += 1;
270 }
271 }
272}
273
274#[cfg(not(target_arch = "aarch64"))]
276#[inline(always)]
277fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
278 let len = data.len();
279 let ptr = data.as_mut_ptr();
280 let mut i = 0;
281 unsafe {
282 while i + 8 <= len {
283 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
284 *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
285 *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
286 *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
287 *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
288 *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
289 *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
290 *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
291 i += 8;
292 }
293 while i < len {
294 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
295 i += 1;
296 }
297 }
298}
299
300#[cfg(target_arch = "aarch64")]
303#[inline(always)]
304fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
305 unsafe { translate_inplace_neon_table(data, table) };
306}
307
308#[cfg(target_arch = "aarch64")]
309#[target_feature(enable = "neon")]
310unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
311 use std::arch::aarch64::*;
312
313 unsafe {
314 let len = data.len();
315 let ptr = data.as_mut_ptr();
316
317 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
319 for h in 0u8..16 {
320 let base = (h as usize) * 16;
321 lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
322 }
323
324 let lo_mask = vdupq_n_u8(0x0F);
325 let mut i = 0;
326
327 while i + 16 <= len {
328 let input = vld1q_u8(ptr.add(i));
329 let lo_nibble = vandq_u8(input, lo_mask);
330 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
331
332 let mut result = vdupq_n_u8(0);
333 macro_rules! do_nibble {
334 ($h:expr) => {
335 let h_val = vdupq_n_u8($h);
336 let mask = vceqq_u8(hi_nibble, h_val);
337 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
338 result = vorrq_u8(result, vandq_u8(mask, looked_up));
339 };
340 }
341 do_nibble!(0);
342 do_nibble!(1);
343 do_nibble!(2);
344 do_nibble!(3);
345 do_nibble!(4);
346 do_nibble!(5);
347 do_nibble!(6);
348 do_nibble!(7);
349 do_nibble!(8);
350 do_nibble!(9);
351 do_nibble!(10);
352 do_nibble!(11);
353 do_nibble!(12);
354 do_nibble!(13);
355 do_nibble!(14);
356 do_nibble!(15);
357
358 vst1q_u8(ptr.add(i), result);
359 i += 16;
360 }
361
362 while i < len {
364 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
365 i += 1;
366 }
367 }
368}
369
370#[cfg(target_arch = "x86_64")]
390#[target_feature(enable = "avx2")]
391unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
392 use std::arch::x86_64::*;
393
394 unsafe {
395 let len = data.len();
396 let ptr = data.as_mut_ptr();
397
398 let mut lut = [_mm256_setzero_si256(); 16];
402 for h in 0u8..16 {
403 let base = (h as usize) * 16;
404 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
405 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
407 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
408 }
409
410 let lo_mask = _mm256_set1_epi8(0x0F);
411
412 let mut i = 0;
413
414 while i + 64 <= len {
418 let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
419 let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
420
421 let lo0 = _mm256_and_si256(input0, lo_mask);
422 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
423 let lo1 = _mm256_and_si256(input1, lo_mask);
424 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
425
426 let mut r0 = _mm256_setzero_si256();
427 let mut r1 = _mm256_setzero_si256();
428
429 macro_rules! do_nibble2 {
430 ($h:expr) => {
431 let h_val = _mm256_set1_epi8($h as i8);
432 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
433 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
434 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
435 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
436 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
437 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
438 };
439 }
440 do_nibble2!(0);
441 do_nibble2!(1);
442 do_nibble2!(2);
443 do_nibble2!(3);
444 do_nibble2!(4);
445 do_nibble2!(5);
446 do_nibble2!(6);
447 do_nibble2!(7);
448 do_nibble2!(8);
449 do_nibble2!(9);
450 do_nibble2!(10);
451 do_nibble2!(11);
452 do_nibble2!(12);
453 do_nibble2!(13);
454 do_nibble2!(14);
455 do_nibble2!(15);
456
457 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
458 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
459 i += 64;
460 }
461
462 if i + 32 <= len {
464 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
465 let lo_nibble = _mm256_and_si256(input, lo_mask);
466 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
467
468 let mut result = _mm256_setzero_si256();
469
470 macro_rules! do_nibble {
471 ($h:expr) => {
472 let h_val = _mm256_set1_epi8($h as i8);
473 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
474 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
475 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
476 };
477 }
478 do_nibble!(0);
479 do_nibble!(1);
480 do_nibble!(2);
481 do_nibble!(3);
482 do_nibble!(4);
483 do_nibble!(5);
484 do_nibble!(6);
485 do_nibble!(7);
486 do_nibble!(8);
487 do_nibble!(9);
488 do_nibble!(10);
489 do_nibble!(11);
490 do_nibble!(12);
491 do_nibble!(13);
492 do_nibble!(14);
493 do_nibble!(15);
494
495 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
496 i += 32;
497 }
498
499 if i + 16 <= len {
501 let lo_mask128 = _mm_set1_epi8(0x0F);
502
503 let mut lut128 = [_mm_setzero_si128(); 16];
504 for h in 0u8..16 {
505 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
506 }
507
508 let input = _mm_loadu_si128(ptr.add(i) as *const _);
509 let lo_nib = _mm_and_si128(input, lo_mask128);
510 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
511
512 let mut res = _mm_setzero_si128();
513 macro_rules! do_nibble128 {
514 ($h:expr) => {
515 let h_val = _mm_set1_epi8($h as i8);
516 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
517 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
518 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
519 };
520 }
521 do_nibble128!(0);
522 do_nibble128!(1);
523 do_nibble128!(2);
524 do_nibble128!(3);
525 do_nibble128!(4);
526 do_nibble128!(5);
527 do_nibble128!(6);
528 do_nibble128!(7);
529 do_nibble128!(8);
530 do_nibble128!(9);
531 do_nibble128!(10);
532 do_nibble128!(11);
533 do_nibble128!(12);
534 do_nibble128!(13);
535 do_nibble128!(14);
536 do_nibble128!(15);
537
538 _mm_storeu_si128(ptr.add(i) as *mut _, res);
539 i += 16;
540 }
541
542 while i < len {
544 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
545 i += 1;
546 }
547 }
548}
549
550#[cfg(target_arch = "x86_64")]
551#[target_feature(enable = "ssse3")]
552unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
553 use std::arch::x86_64::*;
554
555 unsafe {
556 let len = data.len();
557 let ptr = data.as_mut_ptr();
558
559 let mut lut = [_mm_setzero_si128(); 16];
561 for h in 0u8..16 {
562 let base = (h as usize) * 16;
563 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
564 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
565 }
566
567 let lo_mask = _mm_set1_epi8(0x0F);
568
569 let mut i = 0;
570 while i + 16 <= len {
571 let input = _mm_loadu_si128(ptr.add(i) as *const _);
572 let lo_nibble = _mm_and_si128(input, lo_mask);
573 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
574
575 let mut result = _mm_setzero_si128();
576
577 macro_rules! do_nibble {
578 ($h:expr) => {
579 let h_val = _mm_set1_epi8($h as i8);
580 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
581 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
582 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
583 };
584 }
585 do_nibble!(0);
586 do_nibble!(1);
587 do_nibble!(2);
588 do_nibble!(3);
589 do_nibble!(4);
590 do_nibble!(5);
591 do_nibble!(6);
592 do_nibble!(7);
593 do_nibble!(8);
594 do_nibble!(9);
595 do_nibble!(10);
596 do_nibble!(11);
597 do_nibble!(12);
598 do_nibble!(13);
599 do_nibble!(14);
600 do_nibble!(15);
601
602 _mm_storeu_si128(ptr.add(i) as *mut _, result);
603 i += 16;
604 }
605
606 while i < len {
608 *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
609 i += 1;
610 }
611 }
612}
613
614#[inline(always)]
617fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
618 debug_assert!(dst.len() >= src.len());
619 #[cfg(target_arch = "x86_64")]
620 {
621 let level = get_simd_level();
622 if level >= 3 {
623 if dst.as_ptr() as usize & 31 == 0 {
625 unsafe { translate_to_avx2_table_nt(src, dst, table) };
626 } else {
627 unsafe { translate_to_avx2_table(src, dst, table) };
628 }
629 return;
630 }
631 if level >= 2 {
632 unsafe { translate_to_ssse3_table(src, dst, table) };
633 return;
634 }
635 }
636 translate_to_scalar(src, dst, table);
637}
638
639#[cfg(not(target_arch = "aarch64"))]
641#[inline(always)]
642fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
643 unsafe {
644 let sp = src.as_ptr();
645 let dp = dst.as_mut_ptr();
646 let len = src.len();
647 let mut i = 0;
648 while i + 8 <= len {
649 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
650 *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
651 *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
652 *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
653 *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
654 *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
655 *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
656 *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
657 i += 8;
658 }
659 while i < len {
660 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
661 i += 1;
662 }
663 }
664}
665
666#[cfg(target_arch = "aarch64")]
668#[inline(always)]
669fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
670 unsafe { translate_to_neon_table(src, dst, table) };
671}
672
673#[cfg(target_arch = "aarch64")]
674#[target_feature(enable = "neon")]
675unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
676 use std::arch::aarch64::*;
677
678 unsafe {
679 let len = src.len();
680 let sp = src.as_ptr();
681 let dp = dst.as_mut_ptr();
682
683 let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
684 for h in 0u8..16 {
685 lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
686 }
687
688 let lo_mask = vdupq_n_u8(0x0F);
689 let mut i = 0;
690
691 while i + 16 <= len {
692 let input = vld1q_u8(sp.add(i));
693 let lo_nibble = vandq_u8(input, lo_mask);
694 let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
695
696 let mut result = vdupq_n_u8(0);
697 macro_rules! do_nibble {
698 ($h:expr) => {
699 let h_val = vdupq_n_u8($h);
700 let mask = vceqq_u8(hi_nibble, h_val);
701 let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
702 result = vorrq_u8(result, vandq_u8(mask, looked_up));
703 };
704 }
705 do_nibble!(0);
706 do_nibble!(1);
707 do_nibble!(2);
708 do_nibble!(3);
709 do_nibble!(4);
710 do_nibble!(5);
711 do_nibble!(6);
712 do_nibble!(7);
713 do_nibble!(8);
714 do_nibble!(9);
715 do_nibble!(10);
716 do_nibble!(11);
717 do_nibble!(12);
718 do_nibble!(13);
719 do_nibble!(14);
720 do_nibble!(15);
721
722 vst1q_u8(dp.add(i), result);
723 i += 16;
724 }
725
726 while i < len {
727 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
728 i += 1;
729 }
730 }
731}
732
733#[cfg(target_arch = "x86_64")]
734#[target_feature(enable = "avx2")]
735unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
736 use std::arch::x86_64::*;
737
738 unsafe {
739 let len = src.len();
740 let sp = src.as_ptr();
741 let dp = dst.as_mut_ptr();
742
743 let mut lut = [_mm256_setzero_si256(); 16];
745 for h in 0u8..16 {
746 let base = (h as usize) * 16;
747 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
748 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
749 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
750 }
751
752 let lo_mask = _mm256_set1_epi8(0x0F);
753
754 let mut i = 0;
755
756 while i + 64 <= len {
758 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
759 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
760
761 let lo0 = _mm256_and_si256(input0, lo_mask);
762 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
763 let lo1 = _mm256_and_si256(input1, lo_mask);
764 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
765
766 let mut r0 = _mm256_setzero_si256();
767 let mut r1 = _mm256_setzero_si256();
768
769 macro_rules! do_nibble2 {
770 ($h:expr) => {
771 let h_val = _mm256_set1_epi8($h as i8);
772 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
773 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
774 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
775 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
776 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
777 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
778 };
779 }
780 do_nibble2!(0);
781 do_nibble2!(1);
782 do_nibble2!(2);
783 do_nibble2!(3);
784 do_nibble2!(4);
785 do_nibble2!(5);
786 do_nibble2!(6);
787 do_nibble2!(7);
788 do_nibble2!(8);
789 do_nibble2!(9);
790 do_nibble2!(10);
791 do_nibble2!(11);
792 do_nibble2!(12);
793 do_nibble2!(13);
794 do_nibble2!(14);
795 do_nibble2!(15);
796
797 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
798 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
799 i += 64;
800 }
801
802 if i + 32 <= len {
804 let input = _mm256_loadu_si256(sp.add(i) as *const _);
805 let lo_nibble = _mm256_and_si256(input, lo_mask);
806 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
807
808 let mut result = _mm256_setzero_si256();
809
810 macro_rules! do_nibble {
811 ($h:expr) => {
812 let h_val = _mm256_set1_epi8($h as i8);
813 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
814 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
815 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
816 };
817 }
818 do_nibble!(0);
819 do_nibble!(1);
820 do_nibble!(2);
821 do_nibble!(3);
822 do_nibble!(4);
823 do_nibble!(5);
824 do_nibble!(6);
825 do_nibble!(7);
826 do_nibble!(8);
827 do_nibble!(9);
828 do_nibble!(10);
829 do_nibble!(11);
830 do_nibble!(12);
831 do_nibble!(13);
832 do_nibble!(14);
833 do_nibble!(15);
834
835 _mm256_storeu_si256(dp.add(i) as *mut _, result);
836 i += 32;
837 }
838
839 if i + 16 <= len {
841 let lo_mask128 = _mm_set1_epi8(0x0F);
842 let mut lut128 = [_mm_setzero_si128(); 16];
843 for h in 0u8..16 {
844 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
845 }
846
847 let input = _mm_loadu_si128(sp.add(i) as *const _);
848 let lo_nib = _mm_and_si128(input, lo_mask128);
849 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
850
851 let mut res = _mm_setzero_si128();
852 macro_rules! do_nibble128 {
853 ($h:expr) => {
854 let h_val = _mm_set1_epi8($h as i8);
855 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
856 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
857 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
858 };
859 }
860 do_nibble128!(0);
861 do_nibble128!(1);
862 do_nibble128!(2);
863 do_nibble128!(3);
864 do_nibble128!(4);
865 do_nibble128!(5);
866 do_nibble128!(6);
867 do_nibble128!(7);
868 do_nibble128!(8);
869 do_nibble128!(9);
870 do_nibble128!(10);
871 do_nibble128!(11);
872 do_nibble128!(12);
873 do_nibble128!(13);
874 do_nibble128!(14);
875 do_nibble128!(15);
876
877 _mm_storeu_si128(dp.add(i) as *mut _, res);
878 i += 16;
879 }
880
881 while i < len {
883 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
884 i += 1;
885 }
886 }
887}
888
889#[cfg(target_arch = "x86_64")]
892#[target_feature(enable = "avx2")]
893unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
894 use std::arch::x86_64::*;
895
896 unsafe {
897 let len = src.len();
898 let sp = src.as_ptr();
899 let dp = dst.as_mut_ptr();
900
901 let mut lut = [_mm256_setzero_si256(); 16];
903 for h in 0u8..16 {
904 let base = (h as usize) * 16;
905 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
906 let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
907 lut[h as usize] = _mm256_broadcastsi128_si256(row128);
908 }
909
910 let lo_mask = _mm256_set1_epi8(0x0F);
911 let mut i = 0;
912
913 while i + 64 <= len {
915 let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
916 let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
917
918 let lo0 = _mm256_and_si256(input0, lo_mask);
919 let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
920 let lo1 = _mm256_and_si256(input1, lo_mask);
921 let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
922
923 let mut r0 = _mm256_setzero_si256();
924 let mut r1 = _mm256_setzero_si256();
925
926 macro_rules! do_nibble2 {
927 ($h:expr) => {
928 let h_val = _mm256_set1_epi8($h as i8);
929 let m0 = _mm256_cmpeq_epi8(hi0, h_val);
930 let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
931 r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
932 let m1 = _mm256_cmpeq_epi8(hi1, h_val);
933 let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
934 r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
935 };
936 }
937 do_nibble2!(0);
938 do_nibble2!(1);
939 do_nibble2!(2);
940 do_nibble2!(3);
941 do_nibble2!(4);
942 do_nibble2!(5);
943 do_nibble2!(6);
944 do_nibble2!(7);
945 do_nibble2!(8);
946 do_nibble2!(9);
947 do_nibble2!(10);
948 do_nibble2!(11);
949 do_nibble2!(12);
950 do_nibble2!(13);
951 do_nibble2!(14);
952 do_nibble2!(15);
953
954 _mm256_stream_si256(dp.add(i) as *mut _, r0);
955 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
956 i += 64;
957 }
958
959 if i + 32 <= len {
961 let input = _mm256_loadu_si256(sp.add(i) as *const _);
962 let lo_nibble = _mm256_and_si256(input, lo_mask);
963 let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
964
965 let mut result = _mm256_setzero_si256();
966 macro_rules! do_nibble {
967 ($h:expr) => {
968 let h_val = _mm256_set1_epi8($h as i8);
969 let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
970 let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
971 result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
972 };
973 }
974 do_nibble!(0);
975 do_nibble!(1);
976 do_nibble!(2);
977 do_nibble!(3);
978 do_nibble!(4);
979 do_nibble!(5);
980 do_nibble!(6);
981 do_nibble!(7);
982 do_nibble!(8);
983 do_nibble!(9);
984 do_nibble!(10);
985 do_nibble!(11);
986 do_nibble!(12);
987 do_nibble!(13);
988 do_nibble!(14);
989 do_nibble!(15);
990
991 _mm256_stream_si256(dp.add(i) as *mut _, result);
992 i += 32;
993 }
994
995 if i + 16 <= len {
997 let lo_mask128 = _mm_set1_epi8(0x0F);
998 let mut lut128 = [_mm_setzero_si128(); 16];
999 for h in 0u8..16 {
1000 lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
1001 }
1002
1003 let input = _mm_loadu_si128(sp.add(i) as *const _);
1004 let lo_nib = _mm_and_si128(input, lo_mask128);
1005 let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1006
1007 let mut res = _mm_setzero_si128();
1008 macro_rules! do_nibble128 {
1009 ($h:expr) => {
1010 let h_val = _mm_set1_epi8($h as i8);
1011 let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1012 let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1013 res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1014 };
1015 }
1016 do_nibble128!(0);
1017 do_nibble128!(1);
1018 do_nibble128!(2);
1019 do_nibble128!(3);
1020 do_nibble128!(4);
1021 do_nibble128!(5);
1022 do_nibble128!(6);
1023 do_nibble128!(7);
1024 do_nibble128!(8);
1025 do_nibble128!(9);
1026 do_nibble128!(10);
1027 do_nibble128!(11);
1028 do_nibble128!(12);
1029 do_nibble128!(13);
1030 do_nibble128!(14);
1031 do_nibble128!(15);
1032
1033 _mm_storeu_si128(dp.add(i) as *mut _, res);
1034 i += 16;
1035 }
1036
1037 while i < len {
1039 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1040 i += 1;
1041 }
1042
1043 _mm_sfence();
1045 }
1046}
1047
1048#[cfg(target_arch = "x86_64")]
1049#[target_feature(enable = "ssse3")]
1050unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1051 use std::arch::x86_64::*;
1052
1053 unsafe {
1054 let len = src.len();
1055 let sp = src.as_ptr();
1056 let dp = dst.as_mut_ptr();
1057
1058 let mut lut = [_mm_setzero_si128(); 16];
1059 for h in 0u8..16 {
1060 let base = (h as usize) * 16;
1061 let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1062 lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1063 }
1064
1065 let lo_mask = _mm_set1_epi8(0x0F);
1066
1067 let mut i = 0;
1068 while i + 16 <= len {
1069 let input = _mm_loadu_si128(sp.add(i) as *const _);
1070 let lo_nibble = _mm_and_si128(input, lo_mask);
1071 let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1072
1073 let mut result = _mm_setzero_si128();
1074
1075 macro_rules! do_nibble {
1076 ($h:expr) => {
1077 let h_val = _mm_set1_epi8($h as i8);
1078 let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1079 let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1080 result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1081 };
1082 }
1083 do_nibble!(0);
1084 do_nibble!(1);
1085 do_nibble!(2);
1086 do_nibble!(3);
1087 do_nibble!(4);
1088 do_nibble!(5);
1089 do_nibble!(6);
1090 do_nibble!(7);
1091 do_nibble!(8);
1092 do_nibble!(9);
1093 do_nibble!(10);
1094 do_nibble!(11);
1095 do_nibble!(12);
1096 do_nibble!(13);
1097 do_nibble!(14);
1098 do_nibble!(15);
1099
1100 _mm_storeu_si128(dp.add(i) as *mut _, result);
1101 i += 16;
1102 }
1103
1104 while i < len {
1106 *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1107 i += 1;
1108 }
1109 }
1110}
1111
1112#[inline]
1120fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1121 let mut lo: Option<u8> = None;
1122 let mut hi = 0u8;
1123 let mut offset = 0i16;
1124
1125 for i in 0..256 {
1126 if table[i] != i as u8 {
1127 let diff = table[i] as i16 - i as i16;
1128 match lo {
1129 None => {
1130 lo = Some(i as u8);
1131 hi = i as u8;
1132 offset = diff;
1133 }
1134 Some(_) => {
1135 if diff != offset || i as u8 != hi.wrapping_add(1) {
1136 return None;
1137 }
1138 hi = i as u8;
1139 }
1140 }
1141 }
1142 }
1143
1144 lo.map(|l| (l, hi, offset as i8))
1145}
1146
1147#[inline]
1152fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1153 let mut lo: Option<u8> = None;
1154 let mut hi = 0u8;
1155 let mut replacement = 0u8;
1156
1157 for i in 0..256 {
1158 if table[i] != i as u8 {
1159 match lo {
1160 None => {
1161 lo = Some(i as u8);
1162 hi = i as u8;
1163 replacement = table[i];
1164 }
1165 Some(_) => {
1166 if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1167 return None;
1168 }
1169 hi = i as u8;
1170 }
1171 }
1172 }
1173 }
1174
1175 lo.map(|l| (l, hi, replacement))
1176}
1177
1178#[cfg(target_arch = "x86_64")]
1183fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1184 if get_simd_level() >= 3 {
1185 unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1186 } else {
1187 unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1188 }
1189}
1190
1191#[cfg(target_arch = "x86_64")]
1192#[target_feature(enable = "avx2")]
1193unsafe fn translate_range_to_constant_avx2_inplace(
1194 data: &mut [u8],
1195 lo: u8,
1196 hi: u8,
1197 replacement: u8,
1198) {
1199 use std::arch::x86_64::*;
1200
1201 unsafe {
1202 let range = hi - lo;
1203 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1204 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1205 let repl_v = _mm256_set1_epi8(replacement as i8);
1206 let zero = _mm256_setzero_si256();
1207
1208 let len = data.len();
1209 let ptr = data.as_mut_ptr();
1210 let mut i = 0;
1211
1212 while i + 64 <= len {
1214 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1215 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1216 let bi0 = _mm256_add_epi8(in0, bias_v);
1217 let bi1 = _mm256_add_epi8(in1, bias_v);
1218 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1219 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1220 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1221 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1222 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1223 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1224 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1225 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1226 i += 64;
1227 }
1228
1229 if i + 32 <= len {
1231 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1232 let biased = _mm256_add_epi8(input, bias_v);
1233 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1234 let in_range = _mm256_cmpeq_epi8(gt, zero);
1235 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1236 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1237 i += 32;
1238 }
1239
1240 if i + 16 <= len {
1241 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1242 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1243 let repl_v128 = _mm_set1_epi8(replacement as i8);
1244 let zero128 = _mm_setzero_si128();
1245
1246 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1247 let biased = _mm_add_epi8(input, bias_v128);
1248 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1249 let in_range = _mm_cmpeq_epi8(gt, zero128);
1250 let result = _mm_blendv_epi8(input, repl_v128, in_range);
1251 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1252 i += 16;
1253 }
1254
1255 while i < len {
1256 let b = *ptr.add(i);
1257 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1258 i += 1;
1259 }
1260 }
1261}
1262
1263#[cfg(target_arch = "x86_64")]
1264#[target_feature(enable = "sse2")]
1265unsafe fn translate_range_to_constant_sse2_inplace(
1266 data: &mut [u8],
1267 lo: u8,
1268 hi: u8,
1269 replacement: u8,
1270) {
1271 use std::arch::x86_64::*;
1272
1273 unsafe {
1274 let range = hi - lo;
1275 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1276 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1277 let repl_v = _mm_set1_epi8(replacement as i8);
1278 let zero = _mm_setzero_si128();
1279
1280 let len = data.len();
1281 let ptr = data.as_mut_ptr();
1282 let mut i = 0;
1283
1284 while i + 16 <= len {
1285 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1286 let biased = _mm_add_epi8(input, bias_v);
1287 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1288 let in_range = _mm_cmpeq_epi8(gt, zero);
1290 let result = _mm_or_si128(
1292 _mm_and_si128(in_range, repl_v),
1293 _mm_andnot_si128(in_range, input),
1294 );
1295 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1296 i += 16;
1297 }
1298
1299 while i < len {
1300 let b = *ptr.add(i);
1301 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1302 i += 1;
1303 }
1304 }
1305}
1306
1307#[cfg(target_arch = "aarch64")]
1308fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1309 unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1310}
1311
1312#[cfg(target_arch = "aarch64")]
1313#[target_feature(enable = "neon")]
1314unsafe fn translate_range_to_constant_neon_inplace(
1315 data: &mut [u8],
1316 lo: u8,
1317 hi: u8,
1318 replacement: u8,
1319) {
1320 use std::arch::aarch64::*;
1321
1322 unsafe {
1323 let len = data.len();
1324 let ptr = data.as_mut_ptr();
1325 let lo_v = vdupq_n_u8(lo);
1326 let hi_v = vdupq_n_u8(hi);
1327 let repl_v = vdupq_n_u8(replacement);
1328 let mut i = 0;
1329
1330 while i + 32 <= len {
1331 let in0 = vld1q_u8(ptr.add(i));
1332 let in1 = vld1q_u8(ptr.add(i + 16));
1333 let ge0 = vcgeq_u8(in0, lo_v);
1334 let le0 = vcleq_u8(in0, hi_v);
1335 let mask0 = vandq_u8(ge0, le0);
1336 let ge1 = vcgeq_u8(in1, lo_v);
1337 let le1 = vcleq_u8(in1, hi_v);
1338 let mask1 = vandq_u8(ge1, le1);
1339 vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1341 vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1342 i += 32;
1343 }
1344
1345 if i + 16 <= len {
1346 let input = vld1q_u8(ptr.add(i));
1347 let ge = vcgeq_u8(input, lo_v);
1348 let le = vcleq_u8(input, hi_v);
1349 let mask = vandq_u8(ge, le);
1350 vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1351 i += 16;
1352 }
1353
1354 while i < len {
1355 let b = *ptr.add(i);
1356 *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1357 i += 1;
1358 }
1359 }
1360}
1361
1362#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1363fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1364 for b in data.iter_mut() {
1365 if *b >= lo && *b <= hi {
1366 *b = replacement;
1367 }
1368 }
1369}
1370
1371#[cfg(target_arch = "x86_64")]
1374fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1375 if get_simd_level() >= 3 {
1376 unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1377 } else {
1378 unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1379 }
1380}
1381
1382#[cfg(target_arch = "aarch64")]
1383fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1384 unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1385}
1386
1387#[cfg(target_arch = "aarch64")]
1388#[target_feature(enable = "neon")]
1389unsafe fn translate_range_to_constant_neon(
1390 src: &[u8],
1391 dst: &mut [u8],
1392 lo: u8,
1393 hi: u8,
1394 replacement: u8,
1395) {
1396 use std::arch::aarch64::*;
1397
1398 unsafe {
1399 let len = src.len();
1400 let sp = src.as_ptr();
1401 let dp = dst.as_mut_ptr();
1402 let lo_v = vdupq_n_u8(lo);
1403 let hi_v = vdupq_n_u8(hi);
1404 let repl_v = vdupq_n_u8(replacement);
1405 let mut i = 0;
1406
1407 while i + 32 <= len {
1408 let in0 = vld1q_u8(sp.add(i));
1409 let in1 = vld1q_u8(sp.add(i + 16));
1410 let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1411 let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1412 vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1413 vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1414 i += 32;
1415 }
1416
1417 if i + 16 <= len {
1418 let input = vld1q_u8(sp.add(i));
1419 let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1420 vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1421 i += 16;
1422 }
1423
1424 while i < len {
1425 let b = *sp.add(i);
1426 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1427 i += 1;
1428 }
1429 }
1430}
1431
1432#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1433fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1434 for (i, &b) in src.iter().enumerate() {
1435 unsafe {
1436 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1437 }
1438 }
1439}
1440
1441#[cfg(target_arch = "x86_64")]
1442#[target_feature(enable = "avx2")]
1443unsafe fn translate_range_to_constant_avx2(
1444 src: &[u8],
1445 dst: &mut [u8],
1446 lo: u8,
1447 hi: u8,
1448 replacement: u8,
1449) {
1450 use std::arch::x86_64::*;
1451 unsafe {
1452 let range = hi - lo;
1453 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1454 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1455 let repl_v = _mm256_set1_epi8(replacement as i8);
1456 let zero = _mm256_setzero_si256();
1457 let len = src.len();
1458 let sp = src.as_ptr();
1459 let dp = dst.as_mut_ptr();
1460 let mut i = 0;
1461 while i + 64 <= len {
1462 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1463 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1464 let bi0 = _mm256_add_epi8(in0, bias_v);
1465 let bi1 = _mm256_add_epi8(in1, bias_v);
1466 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1467 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1468 let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1469 let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1470 let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1471 let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1472 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1473 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1474 i += 64;
1475 }
1476 if i + 32 <= len {
1477 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1478 let biased = _mm256_add_epi8(input, bias_v);
1479 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1480 let in_range = _mm256_cmpeq_epi8(gt, zero);
1481 let result = _mm256_blendv_epi8(input, repl_v, in_range);
1482 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1483 i += 32;
1484 }
1485 while i < len {
1486 let b = *sp.add(i);
1487 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1488 i += 1;
1489 }
1490 }
1491}
1492
1493#[cfg(target_arch = "x86_64")]
1494#[target_feature(enable = "sse2")]
1495unsafe fn translate_range_to_constant_sse2(
1496 src: &[u8],
1497 dst: &mut [u8],
1498 lo: u8,
1499 hi: u8,
1500 replacement: u8,
1501) {
1502 use std::arch::x86_64::*;
1503 unsafe {
1504 let range = hi - lo;
1505 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1506 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1507 let repl_v = _mm_set1_epi8(replacement as i8);
1508 let zero = _mm_setzero_si128();
1509 let len = src.len();
1510 let sp = src.as_ptr();
1511 let dp = dst.as_mut_ptr();
1512 let mut i = 0;
1513 while i + 16 <= len {
1514 let input = _mm_loadu_si128(sp.add(i) as *const _);
1515 let biased = _mm_add_epi8(input, bias_v);
1516 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1517 let in_range = _mm_cmpeq_epi8(gt, zero);
1518 let result = _mm_or_si128(
1519 _mm_and_si128(in_range, repl_v),
1520 _mm_andnot_si128(in_range, input),
1521 );
1522 _mm_storeu_si128(dp.add(i) as *mut _, result);
1523 i += 16;
1524 }
1525 while i < len {
1526 let b = *sp.add(i);
1527 *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1528 i += 1;
1529 }
1530 }
1531}
1532
1533#[cfg(target_arch = "x86_64")]
1540fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1541 if get_simd_level() >= 3 {
1542 if dst.as_ptr() as usize & 31 == 0 {
1544 unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1545 } else {
1546 unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1547 }
1548 } else {
1549 unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1550 }
1551}
1552
1553#[cfg(target_arch = "x86_64")]
1554#[target_feature(enable = "avx2")]
1555unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1556 use std::arch::x86_64::*;
1557
1558 unsafe {
1559 let range = hi - lo;
1560 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1565 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1566 let offset_v = _mm256_set1_epi8(offset);
1567 let zero = _mm256_setzero_si256();
1568
1569 let len = src.len();
1570 let sp = src.as_ptr();
1571 let dp = dst.as_mut_ptr();
1572 let mut i = 0;
1573
1574 while i + 64 <= len {
1577 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1578 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1579 let bi0 = _mm256_add_epi8(in0, bias_v);
1580 let bi1 = _mm256_add_epi8(in1, bias_v);
1581 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1582 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1583 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1584 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1585 let om0 = _mm256_and_si256(m0, offset_v);
1586 let om1 = _mm256_and_si256(m1, offset_v);
1587 let r0 = _mm256_add_epi8(in0, om0);
1588 let r1 = _mm256_add_epi8(in1, om1);
1589 _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1590 _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1591 i += 64;
1592 }
1593
1594 if i + 32 <= len {
1596 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1597 let biased = _mm256_add_epi8(input, bias_v);
1598 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1599 let mask = _mm256_cmpeq_epi8(gt, zero);
1600 let offset_masked = _mm256_and_si256(mask, offset_v);
1601 let result = _mm256_add_epi8(input, offset_masked);
1602 _mm256_storeu_si256(dp.add(i) as *mut _, result);
1603 i += 32;
1604 }
1605
1606 if i + 16 <= len {
1608 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1609 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1610 let offset_v128 = _mm_set1_epi8(offset);
1611 let zero128 = _mm_setzero_si128();
1612
1613 let input = _mm_loadu_si128(sp.add(i) as *const _);
1614 let biased = _mm_add_epi8(input, bias_v128);
1615 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1616 let mask = _mm_cmpeq_epi8(gt, zero128);
1617 let offset_masked = _mm_and_si128(mask, offset_v128);
1618 let result = _mm_add_epi8(input, offset_masked);
1619 _mm_storeu_si128(dp.add(i) as *mut _, result);
1620 i += 16;
1621 }
1622
1623 while i < len {
1625 let b = *sp.add(i);
1626 *dp.add(i) = if b >= lo && b <= hi {
1627 b.wrapping_add(offset as u8)
1628 } else {
1629 b
1630 };
1631 i += 1;
1632 }
1633 }
1634}
1635
1636#[cfg(target_arch = "x86_64")]
1642#[target_feature(enable = "avx2")]
1643unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1644 use std::arch::x86_64::*;
1645
1646 unsafe {
1647 let range = hi - lo;
1648 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1649 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1650 let offset_v = _mm256_set1_epi8(offset);
1651 let zero = _mm256_setzero_si256();
1652
1653 let len = src.len();
1654 let sp = src.as_ptr();
1655 let dp = dst.as_mut_ptr();
1656 let mut i = 0;
1657
1658 while i + 64 <= len {
1660 let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1661 let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1662 let bi0 = _mm256_add_epi8(in0, bias_v);
1663 let bi1 = _mm256_add_epi8(in1, bias_v);
1664 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1665 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1666 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1667 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1668 let om0 = _mm256_and_si256(m0, offset_v);
1669 let om1 = _mm256_and_si256(m1, offset_v);
1670 let r0 = _mm256_add_epi8(in0, om0);
1671 let r1 = _mm256_add_epi8(in1, om1);
1672 _mm256_stream_si256(dp.add(i) as *mut _, r0);
1673 _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1674 i += 64;
1675 }
1676
1677 if i + 32 <= len {
1679 let input = _mm256_loadu_si256(sp.add(i) as *const _);
1680 let biased = _mm256_add_epi8(input, bias_v);
1681 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1682 let mask = _mm256_cmpeq_epi8(gt, zero);
1683 let offset_masked = _mm256_and_si256(mask, offset_v);
1684 let result = _mm256_add_epi8(input, offset_masked);
1685 _mm256_stream_si256(dp.add(i) as *mut _, result);
1686 i += 32;
1687 }
1688
1689 if i + 16 <= len {
1691 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1692 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1693 let offset_v128 = _mm_set1_epi8(offset);
1694 let zero128 = _mm_setzero_si128();
1695
1696 let input = _mm_loadu_si128(sp.add(i) as *const _);
1697 let biased = _mm_add_epi8(input, bias_v128);
1698 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1699 let mask = _mm_cmpeq_epi8(gt, zero128);
1700 let offset_masked = _mm_and_si128(mask, offset_v128);
1701 let result = _mm_add_epi8(input, offset_masked);
1702 _mm_storeu_si128(dp.add(i) as *mut _, result);
1703 i += 16;
1704 }
1705
1706 while i < len {
1708 let b = *sp.add(i);
1709 *dp.add(i) = if b >= lo && b <= hi {
1710 b.wrapping_add(offset as u8)
1711 } else {
1712 b
1713 };
1714 i += 1;
1715 }
1716
1717 _mm_sfence();
1719 }
1720}
1721
1722#[cfg(target_arch = "x86_64")]
1723#[target_feature(enable = "sse2")]
1724unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1725 use std::arch::x86_64::*;
1726
1727 unsafe {
1728 let range = hi - lo;
1729 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1730 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1731 let offset_v = _mm_set1_epi8(offset);
1732 let zero = _mm_setzero_si128();
1733
1734 let len = src.len();
1735 let mut i = 0;
1736
1737 while i + 16 <= len {
1738 let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1739 let biased = _mm_add_epi8(input, bias_v);
1740 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1741 let mask = _mm_cmpeq_epi8(gt, zero);
1742 let offset_masked = _mm_and_si128(mask, offset_v);
1743 let result = _mm_add_epi8(input, offset_masked);
1744 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1745 i += 16;
1746 }
1747
1748 while i < len {
1749 let b = *src.get_unchecked(i);
1750 *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1751 b.wrapping_add(offset as u8)
1752 } else {
1753 b
1754 };
1755 i += 1;
1756 }
1757 }
1758}
1759
1760#[cfg(target_arch = "aarch64")]
1763fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1764 unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1765}
1766
1767#[cfg(target_arch = "aarch64")]
1768#[target_feature(enable = "neon")]
1769unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1770 use std::arch::aarch64::*;
1771
1772 unsafe {
1773 let len = src.len();
1774 let sp = src.as_ptr();
1775 let dp = dst.as_mut_ptr();
1776 let lo_v = vdupq_n_u8(lo);
1777 let hi_v = vdupq_n_u8(hi);
1778 let offset_v = vdupq_n_s8(offset);
1779 let mut i = 0;
1780
1781 while i + 32 <= len {
1783 let in0 = vld1q_u8(sp.add(i));
1784 let in1 = vld1q_u8(sp.add(i + 16));
1785 let ge0 = vcgeq_u8(in0, lo_v);
1787 let le0 = vcleq_u8(in0, hi_v);
1788 let mask0 = vandq_u8(ge0, le0);
1789 let ge1 = vcgeq_u8(in1, lo_v);
1790 let le1 = vcleq_u8(in1, hi_v);
1791 let mask1 = vandq_u8(ge1, le1);
1792 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1794 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1795 let r0 = vaddq_u8(in0, off0);
1796 let r1 = vaddq_u8(in1, off1);
1797 vst1q_u8(dp.add(i), r0);
1798 vst1q_u8(dp.add(i + 16), r1);
1799 i += 32;
1800 }
1801
1802 if i + 16 <= len {
1803 let input = vld1q_u8(sp.add(i));
1804 let ge = vcgeq_u8(input, lo_v);
1805 let le = vcleq_u8(input, hi_v);
1806 let mask = vandq_u8(ge, le);
1807 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1808 vst1q_u8(dp.add(i), vaddq_u8(input, off));
1809 i += 16;
1810 }
1811
1812 while i < len {
1813 let b = *sp.add(i);
1814 *dp.add(i) = if b >= lo && b <= hi {
1815 b.wrapping_add(offset as u8)
1816 } else {
1817 b
1818 };
1819 i += 1;
1820 }
1821 }
1822}
1823
1824#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1826fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1827 let offset_u8 = offset as u8;
1828 let range = hi.wrapping_sub(lo);
1829 unsafe {
1830 let sp = src.as_ptr();
1831 let dp = dst.as_mut_ptr();
1832 let len = src.len();
1833 let mut i = 0;
1834 while i + 8 <= len {
1835 macro_rules! do_byte {
1836 ($off:expr) => {{
1837 let b = *sp.add(i + $off);
1838 let in_range = b.wrapping_sub(lo) <= range;
1839 *dp.add(i + $off) = if in_range {
1840 b.wrapping_add(offset_u8)
1841 } else {
1842 b
1843 };
1844 }};
1845 }
1846 do_byte!(0);
1847 do_byte!(1);
1848 do_byte!(2);
1849 do_byte!(3);
1850 do_byte!(4);
1851 do_byte!(5);
1852 do_byte!(6);
1853 do_byte!(7);
1854 i += 8;
1855 }
1856 while i < len {
1857 let b = *sp.add(i);
1858 let in_range = b.wrapping_sub(lo) <= range;
1859 *dp.add(i) = if in_range {
1860 b.wrapping_add(offset_u8)
1861 } else {
1862 b
1863 };
1864 i += 1;
1865 }
1866 }
1867}
1868
1869#[cfg(target_arch = "x86_64")]
1877fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1878 if get_simd_level() >= 3 {
1879 unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1880 } else {
1881 unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1882 }
1883}
1884
1885#[cfg(target_arch = "x86_64")]
1886#[target_feature(enable = "avx2")]
1887unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1888 use std::arch::x86_64::*;
1889
1890 unsafe {
1891 let range = hi - lo;
1892 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1893 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1894 let offset_v = _mm256_set1_epi8(offset);
1895 let zero = _mm256_setzero_si256();
1896
1897 let len = data.len();
1898 let ptr = data.as_mut_ptr();
1899 let mut i = 0;
1900
1901 while i + 64 <= len {
1903 let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1904 let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1905 let bi0 = _mm256_add_epi8(in0, bias_v);
1906 let bi1 = _mm256_add_epi8(in1, bias_v);
1907 let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1908 let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1909 let m0 = _mm256_cmpeq_epi8(gt0, zero);
1910 let m1 = _mm256_cmpeq_epi8(gt1, zero);
1911 let om0 = _mm256_and_si256(m0, offset_v);
1912 let om1 = _mm256_and_si256(m1, offset_v);
1913 let r0 = _mm256_add_epi8(in0, om0);
1914 let r1 = _mm256_add_epi8(in1, om1);
1915 _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1916 _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1917 i += 64;
1918 }
1919
1920 if i + 32 <= len {
1922 let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1923 let biased = _mm256_add_epi8(input, bias_v);
1924 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1925 let mask = _mm256_cmpeq_epi8(gt, zero);
1926 let offset_masked = _mm256_and_si256(mask, offset_v);
1927 let result = _mm256_add_epi8(input, offset_masked);
1928 _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1929 i += 32;
1930 }
1931
1932 if i + 16 <= len {
1933 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1934 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1935 let offset_v128 = _mm_set1_epi8(offset);
1936 let zero128 = _mm_setzero_si128();
1937
1938 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1939 let biased = _mm_add_epi8(input, bias_v128);
1940 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1941 let mask = _mm_cmpeq_epi8(gt, zero128);
1942 let offset_masked = _mm_and_si128(mask, offset_v128);
1943 let result = _mm_add_epi8(input, offset_masked);
1944 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1945 i += 16;
1946 }
1947
1948 while i < len {
1949 let b = *ptr.add(i);
1950 *ptr.add(i) = if b >= lo && b <= hi {
1951 b.wrapping_add(offset as u8)
1952 } else {
1953 b
1954 };
1955 i += 1;
1956 }
1957 }
1958}
1959
1960#[cfg(target_arch = "x86_64")]
1961#[target_feature(enable = "sse2")]
1962unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1963 use std::arch::x86_64::*;
1964
1965 unsafe {
1966 let range = hi - lo;
1967 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1968 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1969 let offset_v = _mm_set1_epi8(offset);
1970 let zero = _mm_setzero_si128();
1971
1972 let len = data.len();
1973 let ptr = data.as_mut_ptr();
1974 let mut i = 0;
1975
1976 while i + 16 <= len {
1977 let input = _mm_loadu_si128(ptr.add(i) as *const _);
1978 let biased = _mm_add_epi8(input, bias_v);
1979 let gt = _mm_cmpgt_epi8(biased, threshold_v);
1980 let mask = _mm_cmpeq_epi8(gt, zero);
1981 let offset_masked = _mm_and_si128(mask, offset_v);
1982 let result = _mm_add_epi8(input, offset_masked);
1983 _mm_storeu_si128(ptr.add(i) as *mut _, result);
1984 i += 16;
1985 }
1986
1987 while i < len {
1988 let b = *ptr.add(i);
1989 *ptr.add(i) = if b >= lo && b <= hi {
1990 b.wrapping_add(offset as u8)
1991 } else {
1992 b
1993 };
1994 i += 1;
1995 }
1996 }
1997}
1998
1999#[cfg(target_arch = "aarch64")]
2000fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2001 unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
2002}
2003
2004#[cfg(target_arch = "aarch64")]
2005#[target_feature(enable = "neon")]
2006unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2007 use std::arch::aarch64::*;
2008
2009 unsafe {
2010 let len = data.len();
2011 let ptr = data.as_mut_ptr();
2012 let lo_v = vdupq_n_u8(lo);
2013 let hi_v = vdupq_n_u8(hi);
2014 let offset_v = vdupq_n_s8(offset);
2015 let mut i = 0;
2016
2017 while i + 32 <= len {
2018 let in0 = vld1q_u8(ptr.add(i));
2019 let in1 = vld1q_u8(ptr.add(i + 16));
2020 let ge0 = vcgeq_u8(in0, lo_v);
2021 let le0 = vcleq_u8(in0, hi_v);
2022 let mask0 = vandq_u8(ge0, le0);
2023 let ge1 = vcgeq_u8(in1, lo_v);
2024 let le1 = vcleq_u8(in1, hi_v);
2025 let mask1 = vandq_u8(ge1, le1);
2026 let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2027 let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2028 vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2029 vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2030 i += 32;
2031 }
2032
2033 if i + 16 <= len {
2034 let input = vld1q_u8(ptr.add(i));
2035 let ge = vcgeq_u8(input, lo_v);
2036 let le = vcleq_u8(input, hi_v);
2037 let mask = vandq_u8(ge, le);
2038 let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2039 vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2040 i += 16;
2041 }
2042
2043 while i < len {
2044 let b = *ptr.add(i);
2045 if b >= lo && b <= hi {
2046 *ptr.add(i) = b.wrapping_add(offset as u8);
2047 }
2048 i += 1;
2049 }
2050 }
2051}
2052
2053#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2054fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2055 let offset_u8 = offset as u8;
2056 let range = hi.wrapping_sub(lo);
2057 for b in data.iter_mut() {
2058 if b.wrapping_sub(lo) <= range {
2059 *b = b.wrapping_add(offset_u8);
2060 }
2061 }
2062}
2063
2064#[inline]
2074fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2075 if chars.is_empty() {
2076 return None;
2077 }
2078 let mut lo = chars[0];
2079 let mut hi = chars[0];
2080 for &c in &chars[1..] {
2081 if c < lo {
2082 lo = c;
2083 }
2084 if c > hi {
2085 hi = c;
2086 }
2087 }
2088 if (hi - lo + 1) as usize == chars.len() {
2090 Some((lo, hi))
2091 } else {
2092 None
2093 }
2094}
2095
2096#[cfg(target_arch = "x86_64")]
2100fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2101 if get_simd_level() >= 3 {
2102 unsafe { delete_range_avx2(src, dst, lo, hi) }
2103 } else {
2104 unsafe { delete_range_sse2(src, dst, lo, hi) }
2105 }
2106}
2107
2108#[cfg(target_arch = "x86_64")]
2109#[target_feature(enable = "avx2")]
2110unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2111 use std::arch::x86_64::*;
2112
2113 unsafe {
2114 let range = hi - lo;
2115 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2116 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2117 let zero = _mm256_setzero_si256();
2118
2119 let len = src.len();
2120 let sp = src.as_ptr();
2121 let dp = dst.as_mut_ptr();
2122 let mut ri = 0;
2123 let mut wp = 0;
2124
2125 while ri + 32 <= len {
2126 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2127 let biased = _mm256_add_epi8(input, bias_v);
2128 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2130 let in_range = _mm256_cmpeq_epi8(gt, zero);
2132 let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2134
2135 if keep_mask == 0xFFFFFFFF {
2136 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2138 wp += 32;
2139 } else if keep_mask != 0 {
2140 let m0 = keep_mask as u8;
2145 let m1 = (keep_mask >> 8) as u8;
2146 let m2 = (keep_mask >> 16) as u8;
2147 let m3 = (keep_mask >> 24) as u8;
2148
2149 if m0 == 0xFF {
2150 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2151 } else if m0 != 0 {
2152 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2153 }
2154 let c0 = m0.count_ones() as usize;
2155
2156 if m1 == 0xFF {
2157 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2158 } else if m1 != 0 {
2159 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2160 }
2161 let c1 = m1.count_ones() as usize;
2162
2163 if m2 == 0xFF {
2164 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2165 } else if m2 != 0 {
2166 compact_8bytes(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2167 }
2168 let c2 = m2.count_ones() as usize;
2169
2170 if m3 == 0xFF {
2171 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2172 } else if m3 != 0 {
2173 compact_8bytes(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2174 }
2175 let c3 = m3.count_ones() as usize;
2176 wp += c0 + c1 + c2 + c3;
2177 }
2178 ri += 32;
2180 }
2181
2182 if ri + 16 <= len {
2184 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2185 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2186 let zero128 = _mm_setzero_si128();
2187
2188 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2189 let biased = _mm_add_epi8(input, bias_v128);
2190 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2191 let in_range = _mm_cmpeq_epi8(gt, zero128);
2192 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2193
2194 if keep_mask == 0xFFFF {
2195 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2196 wp += 16;
2197 } else if keep_mask != 0 {
2198 let m0 = keep_mask as u8;
2199 let m1 = (keep_mask >> 8) as u8;
2200 if m0 == 0xFF {
2201 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2202 } else if m0 != 0 {
2203 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2204 }
2205 let c0 = m0.count_ones() as usize;
2206 if m1 == 0xFF {
2207 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2208 } else if m1 != 0 {
2209 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2210 }
2211 wp += c0 + m1.count_ones() as usize;
2212 }
2213 ri += 16;
2214 }
2215
2216 while ri < len {
2218 let b = *sp.add(ri);
2219 *dp.add(wp) = b;
2220 wp += (b < lo || b > hi) as usize;
2221 ri += 1;
2222 }
2223
2224 wp
2225 }
2226}
2227
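/// Left-packs one 8-byte group: copies the source bytes whose bits are set in
/// `mask` (bit i selects `src[i]`) to the front of `dst`, using the indices
/// precomputed in `COMPACT_LUT`. Always stores 8 bytes regardless of the mask,
/// so the destination needs 8 bytes of headroom; callers advance their write
/// cursor by `mask.count_ones()`.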
2228#[cfg(target_arch = "x86_64")]
2236#[inline(always)]
2237unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2238 unsafe {
2239 let idx = COMPACT_LUT.get_unchecked(mask as usize);
2240 *dst = *src.add(*idx.get_unchecked(0) as usize);
2241 *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2242 *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2243 *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2244 *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2245 *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2246 *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2247 *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2248 }
2249}
2250
2251#[cfg(target_arch = "x86_64")]
2252#[target_feature(enable = "sse2")]
2253unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2254 use std::arch::x86_64::*;
2255
2256 unsafe {
2257 let range = hi - lo;
2258 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2259 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2260 let zero = _mm_setzero_si128();
2261
2262 let len = src.len();
2263 let sp = src.as_ptr();
2264 let dp = dst.as_mut_ptr();
2265 let mut ri = 0;
2266 let mut wp = 0;
2267
2268 while ri + 16 <= len {
2269 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2270 let biased = _mm_add_epi8(input, bias_v);
2271 let gt = _mm_cmpgt_epi8(biased, threshold_v);
2272 let in_range = _mm_cmpeq_epi8(gt, zero);
2273 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2274
2275 if keep_mask == 0xFFFF {
2276 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2278 wp += 16;
2279 } else if keep_mask != 0 {
2280 let m0 = keep_mask as u8;
2281 let m1 = (keep_mask >> 8) as u8;
2282 if m0 == 0xFF {
2283 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2284 } else if m0 != 0 {
2285 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2286 }
2287 let c0 = m0.count_ones() as usize;
2288 if m1 == 0xFF {
2289 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2290 } else if m1 != 0 {
2291 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2292 }
2293 wp += c0 + m1.count_ones() as usize;
2294 }
2295 ri += 16;
2296 }
2297
2298 while ri < len {
2300 let b = *sp.add(ri);
2301 *dp.add(wp) = b;
2302 wp += (b < lo || b > hi) as usize;
2303 ri += 1;
2304 }
2305
2306 wp
2307 }
2308}
2309
2310#[cfg(not(target_arch = "x86_64"))]
2314fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2315 let len = src.len();
2316 let sp = src.as_ptr();
2317 let dp = dst.as_mut_ptr();
2318 let mut wp: usize = 0;
2319 let mut i: usize = 0;
2320
2321 while i + 8 <= len {
2323 unsafe {
2324 let b0 = *sp.add(i);
2325 *dp.add(wp) = b0;
2326 wp += (b0 < lo || b0 > hi) as usize;
2327 let b1 = *sp.add(i + 1);
2328 *dp.add(wp) = b1;
2329 wp += (b1 < lo || b1 > hi) as usize;
2330 let b2 = *sp.add(i + 2);
2331 *dp.add(wp) = b2;
2332 wp += (b2 < lo || b2 > hi) as usize;
2333 let b3 = *sp.add(i + 3);
2334 *dp.add(wp) = b3;
2335 wp += (b3 < lo || b3 > hi) as usize;
2336 let b4 = *sp.add(i + 4);
2337 *dp.add(wp) = b4;
2338 wp += (b4 < lo || b4 > hi) as usize;
2339 let b5 = *sp.add(i + 5);
2340 *dp.add(wp) = b5;
2341 wp += (b5 < lo || b5 > hi) as usize;
2342 let b6 = *sp.add(i + 6);
2343 *dp.add(wp) = b6;
2344 wp += (b6 < lo || b6 > hi) as usize;
2345 let b7 = *sp.add(i + 7);
2346 *dp.add(wp) = b7;
2347 wp += (b7 < lo || b7 > hi) as usize;
2348 }
2349 i += 8;
2350 }
2351
2352 while i < len {
2354 unsafe {
2355 let b = *sp.add(i);
2356 *dp.add(wp) = b;
2357 wp += (b < lo || b > hi) as usize;
2358 }
2359 i += 1;
2360 }
2361
2362 wp
2363}
2364
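/// Streaming delete of a contiguous byte range: reads `STREAM_BUF`-sized
/// chunks, compacts away bytes in `lo..=hi` in place, and writes the rest.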
2365fn delete_range_streaming(
2370 lo: u8,
2371 hi: u8,
2372 reader: &mut impl Read,
2373 writer: &mut impl Write,
2374) -> io::Result<()> {
2375 let mut buf = alloc_uninit_vec(STREAM_BUF);
2378 loop {
2379 let n = read_once(reader, &mut buf)?;
2380 if n == 0 {
2381 break;
2382 }
2383 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2384 if wp > 0 {
2385 writer.write_all(&buf[..wp])?;
2386 }
2387 }
2388 Ok(())
2389}
2390
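/// Compacts `buf[..n]` in place, dropping every byte in `lo..=hi`, and returns
/// the number of bytes kept. Dispatches to the AVX2 kernel when available,
/// otherwise uses an 8-way unrolled scalar loop.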
2391#[inline]
2394fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2395 #[cfg(target_arch = "x86_64")]
2396 {
2397 let level = get_simd_level();
2398 if level >= 3 {
2399 return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2400 }
2401 }
2402 let ptr = buf.as_mut_ptr();
2404 let mut ri = 0;
2405 let mut wp = 0;
2406 unsafe {
2407 while ri + 8 <= n {
2408 let b0 = *ptr.add(ri);
2409 let b1 = *ptr.add(ri + 1);
2410 let b2 = *ptr.add(ri + 2);
2411 let b3 = *ptr.add(ri + 3);
2412 let b4 = *ptr.add(ri + 4);
2413 let b5 = *ptr.add(ri + 5);
2414 let b6 = *ptr.add(ri + 6);
2415 let b7 = *ptr.add(ri + 7);
2416 *ptr.add(wp) = b0;
2417 wp += (b0 < lo || b0 > hi) as usize;
2418 *ptr.add(wp) = b1;
2419 wp += (b1 < lo || b1 > hi) as usize;
2420 *ptr.add(wp) = b2;
2421 wp += (b2 < lo || b2 > hi) as usize;
2422 *ptr.add(wp) = b3;
2423 wp += (b3 < lo || b3 > hi) as usize;
2424 *ptr.add(wp) = b4;
2425 wp += (b4 < lo || b4 > hi) as usize;
2426 *ptr.add(wp) = b5;
2427 wp += (b5 < lo || b5 > hi) as usize;
2428 *ptr.add(wp) = b6;
2429 wp += (b6 < lo || b6 > hi) as usize;
2430 *ptr.add(wp) = b7;
2431 wp += (b7 < lo || b7 > hi) as usize;
2432 ri += 8;
2433 }
2434 while ri < n {
2435 let b = *ptr.add(ri);
2436 *ptr.add(wp) = b;
2437 wp += (b < lo || b > hi) as usize;
2438 ri += 1;
2439 }
2440 }
2441 wp
2442}
2443
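/// In-place AVX2 range delete: classifies 32 bytes per iteration, then
/// left-packs each 8-byte subgroup with a `pshufb` shuffle whose control bytes
/// come from `COMPACT_LUT`. Returns the compacted length.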
2444#[cfg(target_arch = "x86_64")]
2447#[target_feature(enable = "avx2")]
2448unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2449 use std::arch::x86_64::*;
2450
2451 unsafe {
2452 let range = hi - lo;
2453 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2454 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2455 let zero = _mm256_setzero_si256();
2456
2457 let ptr = buf.as_mut_ptr();
2458 let mut ri = 0;
2459 let mut wp = 0;
2460
2461 while ri + 32 <= n {
2462 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2463 let biased = _mm256_add_epi8(input, bias_v);
2464 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2465 let in_range = _mm256_cmpeq_epi8(gt, zero);
2466 let del_mask = _mm256_movemask_epi8(in_range) as u32;
2467
2468 if del_mask == 0 {
2469 if wp != ri {
2471 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2472 }
2473 wp += 32;
2474 } else if del_mask != 0xFFFFFFFF {
2475 let keep_mask = !del_mask;
2480 let m0 = keep_mask as u8;
2481 let m1 = (keep_mask >> 8) as u8;
2482 let m2 = (keep_mask >> 16) as u8;
2483 let m3 = (keep_mask >> 24) as u8;
2484
2485 let c0 = m0.count_ones() as usize;
2486 let c1 = m1.count_ones() as usize;
2487 let c2 = m2.count_ones() as usize;
2488 let c3 = m3.count_ones() as usize;
2489
2490 if m0 == 0xFF {
2492 std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2493 } else if m0 != 0 {
2494 let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2495 let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2496 let out_v = _mm_shuffle_epi8(src_v, shuf);
2497 _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2498 }
2499
2500 if m1 == 0xFF {
2502 std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2503 } else if m1 != 0 {
2504 let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2505 let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2506 let out_v = _mm_shuffle_epi8(src_v, shuf);
2507 _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2508 }
2509
2510 if m2 == 0xFF {
2512 std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2513 } else if m2 != 0 {
2514 let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2515 let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2516 let out_v = _mm_shuffle_epi8(src_v, shuf);
2517 _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2518 }
2519
2520 if m3 == 0xFF {
2522 std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2523 } else if m3 != 0 {
2524 let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2525 let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2526 let out_v = _mm_shuffle_epi8(src_v, shuf);
2527 _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2528 }
2529
2530 wp += c0 + c1 + c2 + c3;
2531 }
2532 ri += 32;
2534 }
2535
2536 while ri < n {
2538 let b = *ptr.add(ri);
2539 *ptr.add(wp) = b;
2540 wp += (b < lo || b > hi) as usize;
2541 ri += 1;
2542 }
2543
2544 wp
2545 }
2546}
2547
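/// Streaming translate: applies the `set1` -> `set2` byte mapping to everything
/// read from `reader` and writes the result to `writer`. Identity mappings are
/// passed through untouched, and mappings that form a contiguous range shifted
/// by a constant offset (e.g. lowercasing) or a range collapsed to a single
/// byte take dedicated SIMD fast paths before falling back to the 256-entry
/// lookup table.
///
/// ```ignore
/// // Minimal usage sketch; the import path depends on how this module is exposed.
/// let mut out = Vec::new();
/// translate(b"abc", b"xyz", &mut &b"aabbcc"[..], &mut out).unwrap();
/// assert_eq!(out, b"xxyyzz");
/// ```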
2548pub fn translate(
2553 set1: &[u8],
2554 set2: &[u8],
2555 reader: &mut impl Read,
2556 writer: &mut impl Write,
2557) -> io::Result<()> {
2558 let table = build_translate_table(set1, set2);
2559
2560 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2562 if is_identity {
2563 return passthrough_stream(reader, writer);
2564 }
2565
2566 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2568 return translate_range_stream(lo, hi, offset, reader, writer);
2569 }
2570
2571 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2574 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2575 }
2576
2577 let mut buf = alloc_uninit_vec(STREAM_BUF);
2586 loop {
2587 let n = read_once(reader, &mut buf)?;
2588 if n == 0 {
2589 break;
2590 }
2591 translate_inplace(&mut buf[..n], &table);
2592 writer.write_all(&buf[..n])?;
2593 }
2594 Ok(())
2595}
2596
2597fn translate_range_stream(
2600 lo: u8,
2601 hi: u8,
2602 offset: i8,
2603 reader: &mut impl Read,
2604 writer: &mut impl Write,
2605) -> io::Result<()> {
2606 let mut buf = alloc_uninit_vec(STREAM_BUF);
2607 loop {
2608 let n = read_once(reader, &mut buf)?;
2609 if n == 0 {
2610 break;
2611 }
2612 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2613 writer.write_all(&buf[..n])?;
2614 }
2615 Ok(())
2616}
2617
2618fn translate_range_to_constant_stream(
2621 lo: u8,
2622 hi: u8,
2623 replacement: u8,
2624 reader: &mut impl Read,
2625 writer: &mut impl Write,
2626) -> io::Result<()> {
2627 let mut buf = alloc_uninit_vec(STREAM_BUF);
2628 loop {
2629 let n = read_once(reader, &mut buf)?;
2630 if n == 0 {
2631 break;
2632 }
2633 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2634 writer.write_all(&buf[..n])?;
2635 }
2636 Ok(())
2637}
2638
2639fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2642 let mut buf = alloc_uninit_vec(STREAM_BUF);
2643 loop {
2644 let n = read_once(reader, &mut buf)?;
2645 if n == 0 {
2646 break;
2647 }
2648 writer.write_all(&buf[..n])?;
2649 }
2650 Ok(())
2651}
2652
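/// Performs a single `read`, retrying only when the call is interrupted
/// (`ErrorKind::Interrupted`); a return of 0 means end of input.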
2653#[inline]
2659fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2660 loop {
2661 match reader.read(buf) {
2662 Ok(n) => return Ok(n),
2663 Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2664 Err(e) => return Err(e),
2665 }
2666 }
2667}
2668
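/// Combined translate + squeeze (`tr SET1 SET2 -s`): bytes are translated
/// first, then consecutive repeats of any byte belonging to `set2` are
/// squeezed down to a single occurrence. A `set2` consisting of one distinct
/// byte uses the substring-search fast path below.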
2669pub fn translate_squeeze(
2670 set1: &[u8],
2671 set2: &[u8],
2672 reader: &mut impl Read,
2673 writer: &mut impl Write,
2674) -> io::Result<()> {
2675 let table = build_translate_table(set1, set2);
2676 let squeeze_set = build_member_set(set2);
2677
2678 if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2681 let squeeze_ch = set2.last().copied().unwrap_or(0);
2682 return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2683 }
2684
2685 let range_info = detect_range_offset(&table);
2689 let range_const_info = if range_info.is_none() {
2690 detect_range_to_constant(&table)
2691 } else {
2692 None
2693 };
2694
2695 let mut buf = alloc_uninit_vec(STREAM_BUF);
2696 let mut last_squeezed: u16 = 256;
2697
2698 loop {
2699 let n = read_once(reader, &mut buf)?;
2700 if n == 0 {
2701 break;
2702 }
2703 if let Some((lo, hi, offset)) = range_info {
2705 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2706 } else if let Some((lo, hi, replacement)) = range_const_info {
2707 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2708 } else {
2709 translate_inplace(&mut buf[..n], &table);
2710 }
2711 let mut wp = 0;
2713 unsafe {
2714 let ptr = buf.as_mut_ptr();
2715 let mut i = 0;
2716 while i + 8 <= n {
2717 macro_rules! squeeze_byte {
2718 ($off:expr) => {
2719 let b = *ptr.add(i + $off);
2720 if is_member(&squeeze_set, b) {
2721 if last_squeezed != b as u16 {
2722 last_squeezed = b as u16;
2723 *ptr.add(wp) = b;
2724 wp += 1;
2725 }
2726 } else {
2727 last_squeezed = 256;
2728 *ptr.add(wp) = b;
2729 wp += 1;
2730 }
2731 };
2732 }
2733 squeeze_byte!(0);
2734 squeeze_byte!(1);
2735 squeeze_byte!(2);
2736 squeeze_byte!(3);
2737 squeeze_byte!(4);
2738 squeeze_byte!(5);
2739 squeeze_byte!(6);
2740 squeeze_byte!(7);
2741 i += 8;
2742 }
2743 while i < n {
2744 let b = *ptr.add(i);
2745 if is_member(&squeeze_set, b) {
2746 if last_squeezed == b as u16 {
2747 i += 1;
2748 continue;
2749 }
2750 last_squeezed = b as u16;
2751 } else {
2752 last_squeezed = 256;
2753 }
2754 *ptr.add(wp) = b;
2755 wp += 1;
2756 i += 1;
2757 }
2758 }
2759 writer.write_all(&buf[..wp])?;
2760 }
2761 Ok(())
2762}
2763
2764fn translate_squeeze_single_ch(
2768 table: &[u8; 256],
2769 squeeze_ch: u8,
2770 _squeeze_set: &[u8; 32],
2771 reader: &mut impl Read,
2772 writer: &mut impl Write,
2773) -> io::Result<()> {
2774 let range_info = detect_range_offset(table);
2775 let range_const_info = if range_info.is_none() {
2776 detect_range_to_constant(table)
2777 } else {
2778 None
2779 };
2780
2781 let pair = [squeeze_ch, squeeze_ch];
2782 let finder = memchr::memmem::Finder::new(&pair);
2783 let mut buf = alloc_uninit_vec(STREAM_BUF);
2784 let mut was_squeeze_char = false;
2785
2786 loop {
2787 let n = read_once(reader, &mut buf)?;
2788 if n == 0 {
2789 break;
2790 }
2791 if let Some((lo, hi, offset)) = range_info {
2793 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2794 } else if let Some((lo, hi, replacement)) = range_const_info {
2795 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2796 } else {
2797 translate_inplace(&mut buf[..n], table);
2798 }
2799
2800 let mut i = 0;
2802
2803 if was_squeeze_char {
2805 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2806 i += 1;
2807 }
2808 if i >= n {
2809 continue;
2810 }
2811 }
2812
2813 let ptr = buf.as_mut_ptr();
2814 let mut wp = 0usize;
2815
2816 loop {
2817 match finder.find(&buf[i..n]) {
2818 Some(offset) => {
2819 let seg_end = i + offset + 1;
2820 let gap = seg_end - i;
2821 if gap > 0 {
2822 if wp != i {
2823 unsafe {
2824 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2825 }
2826 }
2827 wp += gap;
2828 }
2829 i = seg_end;
2830 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2831 i += 1;
2832 }
2833 if i >= n {
2834 was_squeeze_char = true;
2835 break;
2836 }
2837 }
2838 None => {
2839 let rem = n - i;
2840 if rem > 0 {
2841 if wp != i {
2842 unsafe {
2843 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2844 }
2845 }
2846 wp += rem;
2847 }
2848 was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2849 break;
2850 }
2851 }
2852 }
2853
2854 if wp > 0 {
2855 writer.write_all(&buf[..wp])?;
2856 }
2857 }
2858 Ok(())
2859}
2860
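/// Streaming delete: removes every occurrence of `delete_chars` from the
/// input. One to three distinct characters go through `memchr`-based scanning,
/// a contiguous range uses the SIMD range kernel, and everything else falls
/// back to a 256-bit membership bitset with an unrolled compaction loop.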
2861pub fn delete(
2862 delete_chars: &[u8],
2863 reader: &mut impl Read,
2864 writer: &mut impl Write,
2865) -> io::Result<()> {
2866 if delete_chars.len() == 1 {
2867 return delete_single_streaming(delete_chars[0], reader, writer);
2868 }
    if matches!(delete_chars.len(), 2 | 3) {
2870 return delete_multi_streaming(delete_chars, reader, writer);
2871 }
2872
2873 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2877 return delete_range_streaming(lo, hi, reader, writer);
2878 }
2879
2880 let member = build_member_set(delete_chars);
2881 let mut buf = alloc_uninit_vec(STREAM_BUF);
2882
2883 loop {
2884 let n = read_once(reader, &mut buf)?;
2885 if n == 0 {
2886 break;
2887 }
2888 let mut wp = 0;
2889 unsafe {
2890 let ptr = buf.as_mut_ptr();
2891 let mut i = 0;
2892 while i + 8 <= n {
2893 let b0 = *ptr.add(i);
2894 let b1 = *ptr.add(i + 1);
2895 let b2 = *ptr.add(i + 2);
2896 let b3 = *ptr.add(i + 3);
2897 let b4 = *ptr.add(i + 4);
2898 let b5 = *ptr.add(i + 5);
2899 let b6 = *ptr.add(i + 6);
2900 let b7 = *ptr.add(i + 7);
2901
2902 *ptr.add(wp) = b0;
2905 wp += !is_member(&member, b0) as usize;
2906 *ptr.add(wp) = b1;
2907 wp += !is_member(&member, b1) as usize;
2908 *ptr.add(wp) = b2;
2909 wp += !is_member(&member, b2) as usize;
2910 *ptr.add(wp) = b3;
2911 wp += !is_member(&member, b3) as usize;
2912 *ptr.add(wp) = b4;
2913 wp += !is_member(&member, b4) as usize;
2914 *ptr.add(wp) = b5;
2915 wp += !is_member(&member, b5) as usize;
2916 *ptr.add(wp) = b6;
2917 wp += !is_member(&member, b6) as usize;
2918 *ptr.add(wp) = b7;
2919 wp += !is_member(&member, b7) as usize;
2920 i += 8;
2921 }
2922 while i < n {
2923 let b = *ptr.add(i);
2924 *ptr.add(wp) = b;
2925 wp += !is_member(&member, b) as usize;
2926 i += 1;
2927 }
2928 }
2929 writer.write_all(&buf[..wp])?;
2930 }
2931 Ok(())
2932}
2933
2934fn delete_single_streaming(
2935 ch: u8,
2936 reader: &mut impl Read,
2937 writer: &mut impl Write,
2938) -> io::Result<()> {
2939 let mut buf = alloc_uninit_vec(STREAM_BUF);
2942 loop {
2943 let n = read_once(reader, &mut buf)?;
2944 if n == 0 {
2945 break;
2946 }
2947 let mut wp = 0;
2948 let mut i = 0;
2949 while i < n {
2950 match memchr::memchr(ch, &buf[i..n]) {
2951 Some(offset) => {
2952 if offset > 0 {
2953 if wp != i {
2954 unsafe {
2955 std::ptr::copy(
2956 buf.as_ptr().add(i),
2957 buf.as_mut_ptr().add(wp),
2958 offset,
2959 );
2960 }
2961 }
2962 wp += offset;
2963 }
2964 i += offset + 1;
2965 }
2966 None => {
2967 let run_len = n - i;
2968 if run_len > 0 {
2969 if wp != i {
2970 unsafe {
2971 std::ptr::copy(
2972 buf.as_ptr().add(i),
2973 buf.as_mut_ptr().add(wp),
2974 run_len,
2975 );
2976 }
2977 }
2978 wp += run_len;
2979 }
2980 break;
2981 }
2982 }
2983 }
2984 if wp > 0 {
2985 writer.write_all(&buf[..wp])?;
2986 }
2987 }
2988 Ok(())
2989}
2990
2991fn delete_multi_streaming(
2992 chars: &[u8],
2993 reader: &mut impl Read,
2994 writer: &mut impl Write,
2995) -> io::Result<()> {
2996 let mut buf = alloc_uninit_vec(STREAM_BUF);
2999 loop {
3000 let n = read_once(reader, &mut buf)?;
3001 if n == 0 {
3002 break;
3003 }
3004 let mut wp = 0;
3005 let mut i = 0;
3006 while i < n {
3007 let found = if chars.len() == 2 {
3008 memchr::memchr2(chars[0], chars[1], &buf[i..n])
3009 } else {
3010 memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3011 };
3012 match found {
3013 Some(offset) => {
3014 if offset > 0 {
3015 if wp != i {
3016 unsafe {
3017 std::ptr::copy(
3018 buf.as_ptr().add(i),
3019 buf.as_mut_ptr().add(wp),
3020 offset,
3021 );
3022 }
3023 }
3024 wp += offset;
3025 }
3026 i += offset + 1;
3027 }
3028 None => {
3029 let run_len = n - i;
3030 if run_len > 0 {
3031 if wp != i {
3032 unsafe {
3033 std::ptr::copy(
3034 buf.as_ptr().add(i),
3035 buf.as_mut_ptr().add(wp),
3036 run_len,
3037 );
3038 }
3039 }
3040 wp += run_len;
3041 }
3042 break;
3043 }
3044 }
3045 }
3046 if wp > 0 {
3047 writer.write_all(&buf[..wp])?;
3048 }
3049 }
3050 Ok(())
3051}
3052
3053pub fn delete_squeeze(
3054 delete_chars: &[u8],
3055 squeeze_chars: &[u8],
3056 reader: &mut impl Read,
3057 writer: &mut impl Write,
3058) -> io::Result<()> {
3059 let delete_set = build_member_set(delete_chars);
3060 let squeeze_set = build_member_set(squeeze_chars);
3061 let mut buf = alloc_uninit_vec(STREAM_BUF);
3062 let mut last_squeezed: u16 = 256;
3063
3064 loop {
3065 let n = read_once(reader, &mut buf)?;
3066 if n == 0 {
3067 break;
3068 }
3069 let mut wp = 0;
3073 unsafe {
3074 let ptr = buf.as_mut_ptr();
3075 let mut i = 0;
3076 while i + 8 <= n {
3077 macro_rules! process_byte {
3078 ($off:expr) => {
3079 let b = *ptr.add(i + $off);
3080 if !is_member(&delete_set, b) {
3081 if is_member(&squeeze_set, b) {
3082 if last_squeezed != b as u16 {
3083 last_squeezed = b as u16;
3084 *ptr.add(wp) = b;
3085 wp += 1;
3086 }
3087 } else {
3088 last_squeezed = 256;
3089 *ptr.add(wp) = b;
3090 wp += 1;
3091 }
3092 }
3093 };
3094 }
3095 process_byte!(0);
3096 process_byte!(1);
3097 process_byte!(2);
3098 process_byte!(3);
3099 process_byte!(4);
3100 process_byte!(5);
3101 process_byte!(6);
3102 process_byte!(7);
3103 i += 8;
3104 }
3105 while i < n {
3106 let b = *ptr.add(i);
3107 if !is_member(&delete_set, b) {
3108 if is_member(&squeeze_set, b) {
3109 if last_squeezed != b as u16 {
3110 last_squeezed = b as u16;
3111 *ptr.add(wp) = b;
3112 wp += 1;
3113 }
3114 } else {
3115 last_squeezed = 256;
3116 *ptr.add(wp) = b;
3117 wp += 1;
3118 }
3119 }
3120 i += 1;
3121 }
3122 }
3123 writer.write_all(&buf[..wp])?;
3124 }
3125 Ok(())
3126}
3127
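/// Streaming squeeze (`tr -s`): collapses runs of any byte in `squeeze_chars`
/// to a single occurrence, preserving runs across buffer boundaries. Small
/// sets (one to three characters) use `memchr`/`memmem` fast paths.
///
/// ```ignore
/// // Minimal usage sketch; the import path depends on how this module is exposed.
/// let mut out = Vec::new();
/// squeeze(b" ", &mut &b"a  b   c"[..], &mut out).unwrap();
/// assert_eq!(out, b"a b c");
/// ```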
3128pub fn squeeze(
3129 squeeze_chars: &[u8],
3130 reader: &mut impl Read,
3131 writer: &mut impl Write,
3132) -> io::Result<()> {
3133 if squeeze_chars.len() == 1 {
3134 return squeeze_single_stream(squeeze_chars[0], reader, writer);
3135 }
3136
    if matches!(squeeze_chars.len(), 2 | 3) {
3140 return squeeze_multi_stream(squeeze_chars, reader, writer);
3141 }
3142
3143 let member = build_member_set(squeeze_chars);
3144 let mut buf = alloc_uninit_vec(STREAM_BUF);
3145 let mut last_squeezed: u16 = 256;
3146
3147 loop {
3148 let n = read_once(reader, &mut buf)?;
3149 if n == 0 {
3150 break;
3151 }
3152 let mut wp = 0;
3153 unsafe {
3154 let ptr = buf.as_mut_ptr();
3155 for i in 0..n {
3156 let b = *ptr.add(i);
3157 if is_member(&member, b) {
3158 if last_squeezed == b as u16 {
3159 continue;
3160 }
3161 last_squeezed = b as u16;
3162 } else {
3163 last_squeezed = 256;
3164 }
3165 *ptr.add(wp) = b;
3166 wp += 1;
3167 }
3168 }
3169 writer.write_all(&buf[..wp])?;
3170 }
3171 Ok(())
3172}
3173
3174fn squeeze_multi_stream(
3178 chars: &[u8],
3179 reader: &mut impl Read,
3180 writer: &mut impl Write,
3181) -> io::Result<()> {
3182 let c0 = chars[0];
3183 let c1 = chars[1];
3184 let c2 = if chars.len() >= 3 {
3185 Some(chars[2])
3186 } else {
3187 None
3188 };
3191
3192 let mut buf = alloc_uninit_vec(STREAM_BUF);
3193 let mut last_squeezed: u16 = 256;
3194
3195 loop {
3196 let n = read_once(reader, &mut buf)?;
3197 if n == 0 {
3198 break;
3199 }
3200
3201 let ptr = buf.as_mut_ptr();
3205 let mut wp = 0usize;
3206 let mut cursor = 0usize;
3207
3208 macro_rules! find_next {
3209 ($start:expr) => {
3210 if let Some(c) = c2 {
3211 memchr::memchr3(c0, c1, c, &buf[$start..n])
3212 } else {
3213 memchr::memchr2(c0, c1, &buf[$start..n])
3214 }
3215 };
3216 }
3217
3218 while cursor < n {
3219 match find_next!(cursor) {
3220 Some(offset) => {
3221 let pos = cursor + offset;
3222 let b = unsafe { *ptr.add(pos) };
3223
3224 let gap = pos - cursor;
3226 if gap > 0 {
3227 if wp != cursor {
3228 unsafe {
3229 std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3230 }
3231 }
3232 wp += gap;
3233 last_squeezed = 256;
3234 }
3235
3236 if last_squeezed != b as u16 {
3238 unsafe { *ptr.add(wp) = b };
3239 wp += 1;
3240 last_squeezed = b as u16;
3241 }
3242
3243 cursor = pos + 1;
3245 while cursor < n && unsafe { *ptr.add(cursor) } == b {
3246 cursor += 1;
3247 }
3248 }
3249 None => {
3250 let rem = n - cursor;
3252 if rem > 0 {
3253 if wp != cursor {
3254 unsafe {
3255 std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3256 }
3257 }
3258 wp += rem;
3259 last_squeezed = 256;
3260 }
3261 break;
3262 }
3263 }
3264 }
3265
3266 writer.write_all(&buf[..wp])?;
3267 }
3268 Ok(())
3269}
3270
3271fn squeeze_single_stream(
3272 ch: u8,
3273 reader: &mut impl Read,
3274 writer: &mut impl Write,
3275) -> io::Result<()> {
3276 let pair = [ch, ch];
3280 let finder = memchr::memmem::Finder::new(&pair);
3281 let mut buf = alloc_uninit_vec(STREAM_BUF);
3282 let mut was_squeeze_char = false;
3283
3284 loop {
3285 let n = read_once(reader, &mut buf)?;
3286 if n == 0 {
3287 break;
3288 }
3289
3290 let mut i = 0;
3291
3292 if was_squeeze_char {
3295 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3296 i += 1;
3297 }
3298 if i >= n {
3299 continue;
3300 }
3301 }
3302
3303 let ptr = buf.as_mut_ptr();
3305 let mut wp = 0usize;
3306
3307 loop {
3308 match finder.find(&buf[i..n]) {
3309 Some(offset) => {
3310 let seg_end = i + offset + 1;
3312 let gap = seg_end - i;
3313 if gap > 0 {
3314 if wp != i {
3315 unsafe {
3316 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3317 }
3318 }
3319 wp += gap;
3320 }
3321 i = seg_end;
3322 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3324 i += 1;
3325 }
3326 if i >= n {
3327 was_squeeze_char = true;
3328 break;
3329 }
3330 }
3331 None => {
3332 let rem = n - i;
3334 if rem > 0 {
3335 if wp != i {
3336 unsafe {
3337 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3338 }
3339 }
3340 wp += rem;
3341 }
3342 was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3343 break;
3344 }
3345 }
3346 }
3347
3348 if wp > 0 {
3349 writer.write_all(&buf[..wp])?;
3350 }
3351 }
3352 Ok(())
3353}
3354
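/// Translates a caller-owned buffer in place and writes it out, splitting the
/// work across rayon worker threads once the buffer reaches
/// `PARALLEL_THRESHOLD`. Uses the same range/constant fast paths as the
/// streaming translator.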
3355pub fn translate_owned(
3363 set1: &[u8],
3364 set2: &[u8],
3365 data: &mut [u8],
3366 writer: &mut impl Write,
3367) -> io::Result<()> {
3368 let table = build_translate_table(set1, set2);
3369
3370 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3372 if is_identity {
3373 return writer.write_all(data);
3374 }
3375
3376 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3378 if data.len() >= PARALLEL_THRESHOLD {
3379 let n_threads = rayon::current_num_threads().max(1);
3380 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3381 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3382 translate_range_simd_inplace(chunk, lo, hi, offset);
3383 });
3384 } else {
3385 translate_range_simd_inplace(data, lo, hi, offset);
3386 }
3387 return writer.write_all(data);
3388 }
3389
3390 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3392 if data.len() >= PARALLEL_THRESHOLD {
3393 let n_threads = rayon::current_num_threads().max(1);
3394 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3395 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3396 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3397 });
3398 } else {
3399 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3400 }
3401 return writer.write_all(data);
3402 }
3403
3404 if data.len() >= PARALLEL_THRESHOLD {
3406 let n_threads = rayon::current_num_threads().max(1);
3407 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3408 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3409 translate_inplace(chunk, &table);
3410 });
3411 } else {
3412 translate_inplace(data, &table);
3413 }
3414 writer.write_all(data)
3415}
3416
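/// Translates a read-only (typically memory-mapped) input without mutating it,
/// writing translated chunks from scratch buffers. Range and range-to-constant
/// mappings get dedicated kernels; everything else goes through the lookup
/// table.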
3417pub fn translate_mmap(
3431 set1: &[u8],
3432 set2: &[u8],
3433 data: &[u8],
3434 writer: &mut impl Write,
3435) -> io::Result<()> {
3436 let table = build_translate_table(set1, set2);
3437
3438 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3440 if is_identity {
3441 return writer.write_all(data);
3442 }
3443
3444 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3446 return translate_mmap_range(data, writer, lo, hi, offset);
3447 }
3448
3449 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3451 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3452 }
3453
3454 translate_mmap_table(data, writer, &table)
3456}
3457
3458fn translate_mmap_range(
3460 data: &[u8],
3461 writer: &mut impl Write,
3462 lo: u8,
3463 hi: u8,
3464 offset: i8,
3465) -> io::Result<()> {
3466 if data.len() >= PARALLEL_THRESHOLD {
3468 let mut buf = alloc_uninit_vec(data.len());
3469 let n_threads = rayon::current_num_threads().max(1);
3470 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3471
3472 data.par_chunks(chunk_size)
3474 .zip(buf.par_chunks_mut(chunk_size))
3475 .for_each(|(src_chunk, dst_chunk)| {
3476 translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3477 });
3478
3479 return writer.write_all(&buf);
3480 }
3481
3482 const CHUNK: usize = 256 * 1024;
3484 let buf_size = data.len().min(CHUNK);
3485 let mut buf = alloc_uninit_vec(buf_size);
3486 for chunk in data.chunks(CHUNK) {
3487 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3488 writer.write_all(&buf[..chunk.len()])?;
3489 }
3490 Ok(())
3491}
3492
3493fn translate_mmap_range_to_constant(
3496 data: &[u8],
3497 writer: &mut impl Write,
3498 lo: u8,
3499 hi: u8,
3500 replacement: u8,
3501) -> io::Result<()> {
3502 if data.len() >= PARALLEL_THRESHOLD {
3504 let mut buf = alloc_uninit_vec(data.len());
3505 let n_threads = rayon::current_num_threads().max(1);
3506 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3507
3508 data.par_chunks(chunk_size)
3510 .zip(buf.par_chunks_mut(chunk_size))
3511 .for_each(|(src_chunk, dst_chunk)| {
3512 dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3513 translate_range_to_constant_simd_inplace(
3514 &mut dst_chunk[..src_chunk.len()],
3515 lo,
3516 hi,
3517 replacement,
3518 );
3519 });
3520
3521 return writer.write_all(&buf);
3522 }
3523
3524 const CHUNK: usize = 256 * 1024;
3526 let buf_size = data.len().min(CHUNK);
3527 let mut buf = alloc_uninit_vec(buf_size);
3528 for chunk in data.chunks(CHUNK) {
3529 buf[..chunk.len()].copy_from_slice(chunk);
3530 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3531 writer.write_all(&buf[..chunk.len()])?;
3532 }
3533 Ok(())
3534}
3535
3536fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3538 if data.len() >= PARALLEL_THRESHOLD {
3540 let mut buf = alloc_uninit_vec(data.len());
3541 let n_threads = rayon::current_num_threads().max(1);
3542 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3543
3544 data.par_chunks(chunk_size)
3545 .zip(buf.par_chunks_mut(chunk_size))
3546 .for_each(|(src_chunk, dst_chunk)| {
3547 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3548 });
3549
3550 return writer.write_all(&buf);
3551 }
3552
3553 const CHUNK: usize = 256 * 1024;
3555 let buf_size = data.len().min(CHUNK);
3556 let mut buf = alloc_uninit_vec(buf_size);
3557 for chunk in data.chunks(CHUNK) {
3558 translate_to(chunk, &mut buf[..chunk.len()], table);
3559 writer.write_all(&buf[..chunk.len()])?;
3560 }
3561 Ok(())
3562}
3563
3564pub fn translate_mmap_inplace(
3571 set1: &[u8],
3572 set2: &[u8],
3573 data: &mut [u8],
3574 writer: &mut impl Write,
3575) -> io::Result<()> {
3576 let table = build_translate_table(set1, set2);
3577
3578 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3580 if is_identity {
3581 return writer.write_all(data);
3582 }
3583
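    // Below this size it is assumed to be cheaper to translate into a separate
    // buffer than to dirty the writable mapping in place; larger inputs are
    // modified in place to avoid a second full-size allocation.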
3584 const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;
3591
3592 if data.len() < SEPARATE_BUF_THRESHOLD {
3593 return translate_to_separate_buf(data, &table, writer);
3594 }
3595
3596 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3598 if data.len() >= PARALLEL_THRESHOLD {
3599 let n_threads = rayon::current_num_threads().max(1);
3600 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3601 data.par_chunks_mut(chunk_size)
3602 .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3603 } else {
3604 translate_range_simd_inplace(data, lo, hi, offset);
3605 }
3606 return writer.write_all(data);
3607 }
3608
3609 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3611 if data.len() >= PARALLEL_THRESHOLD {
3612 let n_threads = rayon::current_num_threads().max(1);
3613 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3614 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3615 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3616 });
3617 } else {
3618 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3619 }
3620 return writer.write_all(data);
3621 }
3622
3623 if data.len() >= PARALLEL_THRESHOLD {
3625 let n_threads = rayon::current_num_threads().max(1);
3626 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3627 data.par_chunks_mut(chunk_size)
3628 .for_each(|chunk| translate_inplace(chunk, &table));
3629 } else {
3630 translate_inplace(data, &table);
3631 }
3632 writer.write_all(data)
3633}
3634
3635fn translate_to_separate_buf(
3643 data: &[u8],
3644 table: &[u8; 256],
3645 writer: &mut impl Write,
3646) -> io::Result<()> {
3647 let range_info = detect_range_offset(table);
3648 let const_info = if range_info.is_none() {
3649 detect_range_to_constant(table)
3650 } else {
3651 None
3652 };
3653
3654 if data.len() >= PARALLEL_THRESHOLD {
3655 let mut out_buf = alloc_uninit_vec(data.len());
3657 let n_threads = rayon::current_num_threads().max(1);
3658 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3659
3660 if let Some((lo, hi, offset)) = range_info {
3661 data.par_chunks(chunk_size)
3662 .zip(out_buf.par_chunks_mut(chunk_size))
3663 .for_each(|(src, dst)| {
3664 translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3665 });
3666 } else if let Some((lo, hi, replacement)) = const_info {
3667 data.par_chunks(chunk_size)
3668 .zip(out_buf.par_chunks_mut(chunk_size))
3669 .for_each(|(src, dst)| {
3670 translate_range_to_constant_simd(
3671 src,
3672 &mut dst[..src.len()],
3673 lo,
3674 hi,
3675 replacement,
3676 );
3677 });
3678 } else {
3679 data.par_chunks(chunk_size)
3680 .zip(out_buf.par_chunks_mut(chunk_size))
3681 .for_each(|(src, dst)| {
3682 translate_to(src, &mut dst[..src.len()], table);
3683 });
3684 }
3685 return writer.write_all(&out_buf);
3686 }
3687
3688 let mut out_buf = alloc_uninit_vec(data.len());
3694 if let Some((lo, hi, offset)) = range_info {
3695 translate_range_simd(data, &mut out_buf, lo, hi, offset);
3696 } else if let Some((lo, hi, replacement)) = const_info {
3697 translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3698 } else {
3699 translate_to(data, &mut out_buf, table);
3700 }
3701 writer.write_all(&out_buf)
3702}
3703
3704pub fn translate_mmap_readonly(
3708 set1: &[u8],
3709 set2: &[u8],
3710 data: &[u8],
3711 writer: &mut impl Write,
3712) -> io::Result<()> {
3713 let table = build_translate_table(set1, set2);
3714
3715 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3717 if is_identity {
3718 return writer.write_all(data);
3719 }
3720
3721 translate_to_separate_buf(data, &table, writer)
3722}
3723
3724pub fn translate_squeeze_mmap(
3730 set1: &[u8],
3731 set2: &[u8],
3732 data: &[u8],
3733 writer: &mut impl Write,
3734) -> io::Result<()> {
3735 let table = build_translate_table(set1, set2);
3736 let squeeze_set = build_member_set(set2);
3737
3738 if data.len() >= PARALLEL_THRESHOLD {
3743 let mut translated = alloc_uninit_vec(data.len());
3745 let range_info = detect_range_offset(&table);
3746 let n_threads = rayon::current_num_threads().max(1);
3747 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3748
3749 if let Some((lo, hi, offset)) = range_info {
3750 data.par_chunks(chunk_size)
3751 .zip(translated.par_chunks_mut(chunk_size))
3752 .for_each(|(src_chunk, dst_chunk)| {
3753 translate_range_simd(
3754 src_chunk,
3755 &mut dst_chunk[..src_chunk.len()],
3756 lo,
3757 hi,
3758 offset,
3759 );
3760 });
3761 } else {
3762 data.par_chunks(chunk_size)
3763 .zip(translated.par_chunks_mut(chunk_size))
3764 .for_each(|(src_chunk, dst_chunk)| {
3765 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
3766 });
3767 }
3768
3769 let mut last_squeezed: u16 = 256;
3773 let len = translated.len();
3774 let mut wp = 0;
3775 unsafe {
3776 let ptr = translated.as_mut_ptr();
3777 let mut i = 0;
3778 while i < len {
3779 let b = *ptr.add(i);
3780 if is_member(&squeeze_set, b) {
3781 if last_squeezed == b as u16 {
3782 i += 1;
3783 continue;
3784 }
3785 last_squeezed = b as u16;
3786 } else {
3787 last_squeezed = 256;
3788 }
3789 *ptr.add(wp) = b;
3790 wp += 1;
3791 i += 1;
3792 }
3793 }
3794 return writer.write_all(&translated[..wp]);
3795 }
3796
3797 let mut buf = alloc_uninit_vec(data.len());
3800 translate_to(data, &mut buf, &table);
3801 let mut last_squeezed: u16 = 256;
3802 let mut wp = 0;
3803 unsafe {
3804 let ptr = buf.as_mut_ptr();
3805 for i in 0..data.len() {
3806 let b = *ptr.add(i);
3807 if is_member(&squeeze_set, b) {
3808 if last_squeezed == b as u16 {
3809 continue;
3810 }
3811 last_squeezed = b as u16;
3812 } else {
3813 last_squeezed = 256;
3814 }
3815 *ptr.add(wp) = b;
3816 wp += 1;
3817 }
3818 }
3819 writer.write_all(&buf[..wp])
3820}
3821
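/// Delete on a memory-mapped input. Small sets use `memchr` scanning with
/// zero-copy `IoSlice` writes and contiguous ranges use the SIMD range path;
/// otherwise the first 1 KiB is sampled to estimate how many bytes will be
/// deleted, choosing between gathering kept runs as `IoSlice`s (sparse
/// deletions) and compacting into a scratch buffer (dense deletions).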
3822pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
3828 if delete_chars.len() == 1 {
3829 return delete_single_char_mmap(delete_chars[0], data, writer);
3830 }
    if matches!(delete_chars.len(), 2 | 3) {
3832 return delete_multi_memchr_mmap(delete_chars, data, writer);
3833 }
3834
3835 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
3837 return delete_range_mmap(data, writer, lo, hi);
3838 }
3839
3840 let member = build_member_set(delete_chars);
3841
3842 let sample_size = data.len().min(1024);
3847 let sample_deletes = data[..sample_size]
3848 .iter()
3849 .filter(|&&b| is_member(&member, b))
3850 .count();
3851 let estimated_deletes = if sample_size > 0 {
3852 data.len() * sample_deletes / sample_size
3853 } else {
3854 data.len()
3855 };
3856
3857 if estimated_deletes < MAX_IOV / 2 {
3858 return delete_bitset_zerocopy(data, &member, writer);
3859 }
3860
3861 if data.len() >= PARALLEL_THRESHOLD {
3863 let n_threads = rayon::current_num_threads().max(1);
3864 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3865
3866 let mut outbuf = alloc_uninit_vec(data.len());
3867 let chunk_lens: Vec<usize> = data
3868 .par_chunks(chunk_size)
3869 .zip(outbuf.par_chunks_mut(chunk_size))
3870 .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
3871 .collect();
3872
3873 let slices: Vec<std::io::IoSlice> = chunk_lens
3877 .iter()
3878 .enumerate()
3879 .filter(|&(_, &len)| len > 0)
3880 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
3881 .collect();
3882 return write_ioslices(writer, &slices);
3883 }
3884
3885 const COMPACT_BUF: usize = 256 * 1024;
3888 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
3889
3890 for chunk in data.chunks(COMPACT_BUF) {
3891 let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
3892 if out_pos > 0 {
3893 writer.write_all(&outbuf[..out_pos])?;
3894 }
3895 }
3896 Ok(())
3897}
3898
3899fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
3904 let sample_size = data.len().min(1024);
3906 let sample_deletes = data[..sample_size]
3907 .iter()
3908 .filter(|&&b| b >= lo && b <= hi)
3909 .count();
3910 let estimated_deletes = if sample_size > 0 {
3916 data.len() * sample_deletes / sample_size
3917 } else {
3918 data.len()
3919 };
3920 if estimated_deletes < MAX_IOV / 2 {
3921 return delete_range_mmap_zerocopy(data, writer, lo, hi);
3922 }
3923
3924 if data.len() >= PARALLEL_THRESHOLD {
3926 let n_threads = rayon::current_num_threads().max(1);
3927 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3928
3929 let mut outbuf = alloc_uninit_vec(data.len());
3930 let chunk_lens: Vec<usize> = data
3931 .par_chunks(chunk_size)
3932 .zip(outbuf.par_chunks_mut(chunk_size))
3933 .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
3934 .collect();
3935
3936 let slices: Vec<std::io::IoSlice> = chunk_lens
3939 .iter()
3940 .enumerate()
3941 .filter(|&(_, &len)| len > 0)
3942 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
3943 .collect();
3944 return write_ioslices(writer, &slices);
3945 }
3946
3947 const COMPACT_BUF: usize = 256 * 1024;
3951 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
3952
3953 #[cfg(target_arch = "x86_64")]
3954 {
3955 let mut wp = 0;
3956 let level = get_simd_level();
3957 let len = data.len();
3958 let sp = data.as_ptr();
3959 let dp = outbuf.as_mut_ptr();
3960 let mut ri = 0;
3961
3962 if level >= 3 {
3963 use std::arch::x86_64::*;
3964 let range = hi - lo;
3965 let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
3966 let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
3967 let zero = unsafe { _mm256_setzero_si256() };
3968
3969 while ri + 32 <= len {
3970 if wp + 32 > COMPACT_BUF {
3972 writer.write_all(&outbuf[..wp])?;
3973 wp = 0;
3974 }
3975
3976 let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
3977 let biased = unsafe { _mm256_add_epi8(input, bias_v) };
3978 let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
3979 let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
3980 let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
3981
3982 if keep_mask == 0xFFFFFFFF {
3983 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
3984 wp += 32;
3985 } else if keep_mask != 0 {
3986 let m0 = keep_mask as u8;
3987 let m1 = (keep_mask >> 8) as u8;
3988 let m2 = (keep_mask >> 16) as u8;
3989 let m3 = (keep_mask >> 24) as u8;
3990
3991 if m0 == 0xFF {
3992 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
3993 } else if m0 != 0 {
3994 unsafe { compact_8bytes(sp.add(ri), dp.add(wp), m0) };
3995 }
3996 let c0 = m0.count_ones() as usize;
3997
3998 if m1 == 0xFF {
3999 unsafe {
4000 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4001 };
4002 } else if m1 != 0 {
4003 unsafe { compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1) };
4004 }
4005 let c1 = m1.count_ones() as usize;
4006
4007 if m2 == 0xFF {
4008 unsafe {
4009 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4010 };
4011 } else if m2 != 0 {
4012 unsafe { compact_8bytes(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4013 }
4014 let c2 = m2.count_ones() as usize;
4015
4016 if m3 == 0xFF {
4017 unsafe {
4018 std::ptr::copy_nonoverlapping(
4019 sp.add(ri + 24),
4020 dp.add(wp + c0 + c1 + c2),
4021 8,
4022 )
4023 };
4024 } else if m3 != 0 {
4025 unsafe { compact_8bytes(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3) };
4026 }
4027 let c3 = m3.count_ones() as usize;
4028 wp += c0 + c1 + c2 + c3;
4029 }
4030 ri += 32;
4031 }
4032 }
4033
4034 while ri < len {
4036 if wp + 1 > COMPACT_BUF {
4037 writer.write_all(&outbuf[..wp])?;
4038 wp = 0;
4039 }
4040 let b = unsafe { *sp.add(ri) };
4041 unsafe { *dp.add(wp) = b };
4042 wp += (b < lo || b > hi) as usize;
4043 ri += 1;
4044 }
4045
4046 if wp > 0 {
4047 writer.write_all(&outbuf[..wp])?;
4048 }
4049 return Ok(());
4050 }
4051
4052 #[cfg(not(target_arch = "x86_64"))]
4053 {
4054 for chunk in data.chunks(COMPACT_BUF) {
4056 let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4057 if clen > 0 {
4058 writer.write_all(&outbuf[..clen])?;
4059 }
4060 }
4061 return Ok(());
4062 }
4063
4064 #[allow(unreachable_code)]
4065 Ok(())
4066}
4067
4068fn delete_range_mmap_zerocopy(
4073 data: &[u8],
4074 writer: &mut impl Write,
4075 lo: u8,
4076 hi: u8,
4077) -> io::Result<()> {
4078 #[cfg(target_arch = "x86_64")]
4079 {
4080 if get_simd_level() >= 3 {
4081 return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4082 }
4083 if get_simd_level() >= 2 {
4084 return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4085 }
4086 }
4087
4088 #[cfg(target_arch = "aarch64")]
4089 {
4090 return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4091 }
4092
4093 #[allow(unreachable_code)]
4095 delete_range_zerocopy_scalar(data, writer, lo, hi)
4096}
4097
4098fn delete_range_zerocopy_scalar(
4101 data: &[u8],
4102 writer: &mut impl Write,
4103 lo: u8,
4104 hi: u8,
4105) -> io::Result<()> {
4106 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4107 let len = data.len();
4108 let mut run_start: usize = 0;
4109 let mut i: usize = 0;
4110
4111 while i < len {
4112 let b = unsafe { *data.get_unchecked(i) };
4113 if b >= lo && b <= hi {
4114 if i > run_start {
4115 iov.push(std::io::IoSlice::new(&data[run_start..i]));
4116 if iov.len() >= MAX_IOV {
4117 write_ioslices(writer, &iov)?;
4118 iov.clear();
4119 }
4120 }
4121 run_start = i + 1;
4122 }
4123 i += 1;
4124 }
4125 if run_start < len {
4126 iov.push(std::io::IoSlice::new(&data[run_start..]));
4127 }
4128 if !iov.is_empty() {
4129 write_ioslices(writer, &iov)?;
4130 }
4131 Ok(())
4132}
4133
4134#[cfg(target_arch = "x86_64")]
4138#[target_feature(enable = "avx2")]
4139unsafe fn delete_range_zerocopy_avx2(
4140 data: &[u8],
4141 writer: &mut impl Write,
4142 lo: u8,
4143 hi: u8,
4144) -> io::Result<()> {
4145 use std::arch::x86_64::*;
4146
4147 unsafe {
4148 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4149 let len = data.len();
4150 let mut run_start: usize = 0;
4151 let mut ri: usize = 0;
4152
4153 let range = hi - lo;
4154 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4155 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4156 let zero = _mm256_setzero_si256();
4157
4158 while ri + 32 <= len {
4159 let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4160 let biased = _mm256_add_epi8(input, bias_v);
4161 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4162 let in_range = _mm256_cmpeq_epi8(gt, zero);
4163 let del_mask = _mm256_movemask_epi8(in_range) as u32;
4164
4165 if del_mask == 0 {
4166 ri += 32;
4168 continue;
4169 }
4170
4171 let mut m = del_mask;
4173 while m != 0 {
4174 let bit = m.trailing_zeros() as usize;
4175 let abs_pos = ri + bit;
4176 if abs_pos > run_start {
4177 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4178 if iov.len() >= MAX_IOV {
4179 write_ioslices(writer, &iov)?;
4180 iov.clear();
4181 }
4182 }
4183 run_start = abs_pos + 1;
                m &= m - 1;
            }
4186
4187 ri += 32;
4188 }
4189
4190 while ri < len {
4192 let b = *data.get_unchecked(ri);
4193 if b >= lo && b <= hi {
4194 if ri > run_start {
4195 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4196 if iov.len() >= MAX_IOV {
4197 write_ioslices(writer, &iov)?;
4198 iov.clear();
4199 }
4200 }
4201 run_start = ri + 1;
4202 }
4203 ri += 1;
4204 }
4205
4206 if run_start < len {
4207 iov.push(std::io::IoSlice::new(&data[run_start..]));
4208 }
4209 if !iov.is_empty() {
4210 write_ioslices(writer, &iov)?;
4211 }
4212 Ok(())
4213 }
4214}
4215
4216#[cfg(target_arch = "x86_64")]
4218#[target_feature(enable = "sse2")]
4219unsafe fn delete_range_zerocopy_sse2(
4220 data: &[u8],
4221 writer: &mut impl Write,
4222 lo: u8,
4223 hi: u8,
4224) -> io::Result<()> {
4225 use std::arch::x86_64::*;
4226
4227 unsafe {
4228 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4229 let len = data.len();
4230 let mut run_start: usize = 0;
4231 let mut ri: usize = 0;
4232
4233 let range = hi - lo;
4234 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4235 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4236 let zero = _mm_setzero_si128();
4237
4238 while ri + 16 <= len {
4239 let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4240 let biased = _mm_add_epi8(input, bias_v);
4241 let gt = _mm_cmpgt_epi8(biased, threshold_v);
4242 let in_range = _mm_cmpeq_epi8(gt, zero);
4243 let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4244
4245 if del_mask == 0 {
4246 ri += 16;
4247 continue;
4248 }
4249
4250 let mut m = del_mask;
4251 while m != 0 {
4252 let bit = m.trailing_zeros() as usize;
4253 let abs_pos = ri + bit;
4254 if abs_pos > run_start {
4255 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4256 if iov.len() >= MAX_IOV {
4257 write_ioslices(writer, &iov)?;
4258 iov.clear();
4259 }
4260 }
4261 run_start = abs_pos + 1;
4262 m &= m - 1;
4263 }
4264
4265 ri += 16;
4266 }
4267
4268 while ri < len {
4269 let b = *data.get_unchecked(ri);
4270 if b >= lo && b <= hi {
4271 if ri > run_start {
4272 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4273 if iov.len() >= MAX_IOV {
4274 write_ioslices(writer, &iov)?;
4275 iov.clear();
4276 }
4277 }
4278 run_start = ri + 1;
4279 }
4280 ri += 1;
4281 }
4282
4283 if run_start < len {
4284 iov.push(std::io::IoSlice::new(&data[run_start..]));
4285 }
4286 if !iov.is_empty() {
4287 write_ioslices(writer, &iov)?;
4288 }
4289 Ok(())
4290 }
4291}
4292
4293#[cfg(target_arch = "aarch64")]
4297#[target_feature(enable = "neon")]
4298unsafe fn delete_range_zerocopy_neon(
4299 data: &[u8],
4300 writer: &mut impl Write,
4301 lo: u8,
4302 hi: u8,
4303) -> io::Result<()> {
4304 use std::arch::aarch64::*;
4305
4306 unsafe {
4307 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4308 let len = data.len();
4309 let mut run_start: usize = 0;
4310 let mut ri: usize = 0;
4311
4312 let lo_v = vdupq_n_u8(lo);
4313 let hi_v = vdupq_n_u8(hi);
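        // NEON has no movemask: weight each lane with a power of two, then
        // three widening pairwise adds (u8 -> u16 -> u32 -> u64) collapse each
        // 64-bit half into an 8-bit mask of in-range lanes.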
4314 let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4316 let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4317
4318 while ri + 16 <= len {
4319 let input = vld1q_u8(data.as_ptr().add(ri));
4320 let ge_lo = vcgeq_u8(input, lo_v);
4322 let le_hi = vcleq_u8(input, hi_v);
4323 let in_range = vandq_u8(ge_lo, le_hi);
4324
4325 let bits = vandq_u8(in_range, bit_mask_v);
            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4331 let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4332 let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4333
4334 if del_mask == 0 {
4335 ri += 16;
4337 continue;
4338 }
4339
4340 let mut m = del_mask;
4342 while m != 0 {
4343 let bit = m.trailing_zeros() as usize;
4344 let abs_pos = ri + bit;
4345 if abs_pos > run_start {
4346 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4347 if iov.len() >= MAX_IOV {
4348 write_ioslices(writer, &iov)?;
4349 iov.clear();
4350 }
4351 }
4352 run_start = abs_pos + 1;
4353 m &= m - 1;
4354 }
4355
4356 ri += 16;
4357 }
4358
4359 while ri < len {
4361 let b = *data.get_unchecked(ri);
4362 if b >= lo && b <= hi {
4363 if ri > run_start {
4364 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4365 if iov.len() >= MAX_IOV {
4366 write_ioslices(writer, &iov)?;
4367 iov.clear();
4368 }
4369 }
4370 run_start = ri + 1;
4371 }
4372 ri += 1;
4373 }
4374
4375 if run_start < len {
4376 iov.push(std::io::IoSlice::new(&data[run_start..]));
4377 }
4378 if !iov.is_empty() {
4379 write_ioslices(writer, &iov)?;
4380 }
4381 Ok(())
4382 }
4383}
4384
4385#[inline]
4388fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4389 let len = chunk.len();
4390 let mut out_pos = 0;
4391 let mut i = 0;
4392
4393 while i + 8 <= len {
4394 unsafe {
4395 let b0 = *chunk.get_unchecked(i);
4396 let b1 = *chunk.get_unchecked(i + 1);
4397 let b2 = *chunk.get_unchecked(i + 2);
4398 let b3 = *chunk.get_unchecked(i + 3);
4399 let b4 = *chunk.get_unchecked(i + 4);
4400 let b5 = *chunk.get_unchecked(i + 5);
4401 let b6 = *chunk.get_unchecked(i + 6);
4402 let b7 = *chunk.get_unchecked(i + 7);
4403
4404 *outbuf.get_unchecked_mut(out_pos) = b0;
4405 out_pos += !is_member(member, b0) as usize;
4406 *outbuf.get_unchecked_mut(out_pos) = b1;
4407 out_pos += !is_member(member, b1) as usize;
4408 *outbuf.get_unchecked_mut(out_pos) = b2;
4409 out_pos += !is_member(member, b2) as usize;
4410 *outbuf.get_unchecked_mut(out_pos) = b3;
4411 out_pos += !is_member(member, b3) as usize;
4412 *outbuf.get_unchecked_mut(out_pos) = b4;
4413 out_pos += !is_member(member, b4) as usize;
4414 *outbuf.get_unchecked_mut(out_pos) = b5;
4415 out_pos += !is_member(member, b5) as usize;
4416 *outbuf.get_unchecked_mut(out_pos) = b6;
4417 out_pos += !is_member(member, b6) as usize;
4418 *outbuf.get_unchecked_mut(out_pos) = b7;
4419 out_pos += !is_member(member, b7) as usize;
4420 }
4421 i += 8;
4422 }
4423
4424 while i < len {
4425 unsafe {
4426 let b = *chunk.get_unchecked(i);
4427 *outbuf.get_unchecked_mut(out_pos) = b;
4428 out_pos += !is_member(member, b) as usize;
4429 }
4430 i += 1;
4431 }
4432
4433 out_pos
4434}
4435
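/// Zero-copy delete for sparse deletions: walks the input once, pushing each
/// kept run as an `IoSlice` borrowing directly from `data`, and flushes the
/// batch whenever `MAX_IOV` slices have accumulated.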
4436fn delete_bitset_zerocopy(
4441 data: &[u8],
4442 member: &[u8; 32],
4443 writer: &mut impl Write,
4444) -> io::Result<()> {
4445 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4446 let len = data.len();
4447 let mut i = 0;
4448 let mut run_start: Option<usize> = None;
4449
4450 while i < len {
4451 let b = unsafe { *data.get_unchecked(i) };
4452 if is_member(member, b) {
4453 if let Some(rs) = run_start {
4455 iov.push(std::io::IoSlice::new(&data[rs..i]));
4456 run_start = None;
4457 if iov.len() >= MAX_IOV {
4458 write_ioslices(writer, &iov)?;
4459 iov.clear();
4460 }
4461 }
4462 } else {
4463 if run_start.is_none() {
4465 run_start = Some(i);
4466 }
4467 }
4468 i += 1;
4469 }
4470 if let Some(rs) = run_start {
4472 iov.push(std::io::IoSlice::new(&data[rs..]));
4473 }
4474 if !iov.is_empty() {
4475 write_ioslices(writer, &iov)?;
4476 }
4477 Ok(())
4478}
4479
4480fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4481 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4485 let mut last = 0;
4486 for pos in memchr::memchr_iter(ch, data) {
4487 if pos > last {
4488 iov.push(std::io::IoSlice::new(&data[last..pos]));
4489 if iov.len() >= MAX_IOV {
4490 write_ioslices(writer, &iov)?;
4491 iov.clear();
4492 }
4493 }
4494 last = pos + 1;
4495 }
4496 if last < data.len() {
4497 iov.push(std::io::IoSlice::new(&data[last..]));
4498 }
4499 if !iov.is_empty() {
4500 write_ioslices(writer, &iov)?;
4501 }
4502 Ok(())
4503}
4504
4505fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4506 let c0 = chars[0];
4507 let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4508 let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4509 let is_three = chars.len() >= 3;
4510
4511 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4513 let mut last = 0;
4514
4515 macro_rules! process_pos {
4516 ($pos:expr) => {
4517 if $pos > last {
4518 iov.push(std::io::IoSlice::new(&data[last..$pos]));
4519 if iov.len() >= MAX_IOV {
4520 write_ioslices(writer, &iov)?;
4521 iov.clear();
4522 }
4523 }
4524 last = $pos + 1;
4525 };
4526 }
4527
4528 if is_three {
4529 for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4530 process_pos!(pos);
4531 }
4532 } else {
4533 for pos in memchr::memchr2_iter(c0, c1, data) {
4534 process_pos!(pos);
4535 }
4536 }
4537 if last < data.len() {
4538 iov.push(std::io::IoSlice::new(&data[last..]));
4539 }
4540 if !iov.is_empty() {
4541 write_ioslices(writer, &iov)?;
4542 }
4543 Ok(())
4544}
4545
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    let mut outbuf = alloc_uninit_vec(data.len());
    let mut last_squeezed: u16 = 256;
    let mut out_pos = 0;

    for &b in data.iter() {
        if is_member(&delete_set, b) {
            continue;
        }
        if is_member(&squeeze_set, b) {
            if last_squeezed == b as u16 {
                continue;
            }
            last_squeezed = b as u16;
        } else {
            last_squeezed = 256;
        }
        unsafe {
            *outbuf.get_unchecked_mut(out_pos) = b;
        }
        out_pos += 1;
    }
    writer.write_all(&outbuf[..out_pos])
}

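/// Squeezes repeats of `squeeze_chars` in an mmapped input. Sets of one to
/// three bytes dispatch to specialized memchr-based fast paths; large inputs
/// are squeezed per-chunk in parallel with rayon and stitched back together at
/// the chunk boundaries.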
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut outbuf = alloc_uninit_vec(data.len());
    let len = data.len();
    let mut wp = 0;
    let mut i = 0;
    let mut last_squeezed: u16 = 256;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();

        while i < len {
            let b = *inp.add(i);
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
    }
    writer.write_all(&outbuf[..wp])
}

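/// Squeezes one chunk against a membership bitset into a freshly allocated
/// buffer; used as the per-chunk worker for the parallel squeeze paths.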
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

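/// Squeezes a set of two or three byte values, chosen by the const parameter
/// `N`: large inputs go through the parallel chunked path, smaller ones walk
/// the data with `memchr2`/`memchr3` and emit zero-copy `IoSlice` runs.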
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut cursor = 0;
    let mut last_squeezed: u16 = 256;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}

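/// Squeezes repeats of a single byte value. If the input contains no adjacent
/// pair of that byte it is written through untouched; otherwise a `memmem`
/// finder locates each pair and runs are collapsed via zero-copy `IoSlice`s.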
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let pair = [ch, ch];
    if memchr::memmem::find(data, &pair).is_none() {
        return writer.write_all(data);
    }

    let finder = memchr::memmem::Finder::new(&pair);
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
    let mut cursor = 0;

    while cursor < data.len() {
        match finder.find(&data[cursor..]) {
            Some(offset) => {
                let pair_pos = cursor + offset;
                let seg_end = pair_pos + 1;
                if seg_end > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
                }
                let mut skip = seg_end;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }

    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
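
// A minimal sanity-check sketch of the squeeze/delete entry points above,
// driving them with an in-memory `Vec<u8>` writer and hand-computed expected
// outputs. The helper name `run_to_vec` is illustrative only and not part of
// the module's API.
#[cfg(test)]
mod squeeze_delete_sketch {
    use super::*;

    // Runs an operation against an in-memory writer and returns the bytes written.
    fn run_to_vec<F>(op: F) -> Vec<u8>
    where
        F: FnOnce(&mut Vec<u8>) -> io::Result<()>,
    {
        let mut out = Vec::new();
        op(&mut out).expect("writing to a Vec should not fail");
        out
    }

    #[test]
    fn squeeze_single_collapses_runs() {
        // Runs of 'a' collapse to a single 'a'; other bytes pass through.
        let out = run_to_vec(|w| squeeze_single_mmap(b'a', b"aaabcaad", w));
        assert_eq!(out, b"abcad");
    }

    #[test]
    fn delete_then_squeeze() {
        // 'x' is deleted first, then runs of 'a' among the survivors are squeezed.
        let out = run_to_vec(|w| delete_squeeze_mmap(b"x", b"a", b"xaxaabxx", w));
        assert_eq!(out, b"ab");
    }
}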