use std::io::{self, Read, Write};

use rayon::prelude::*;

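/// Maximum number of `IoSlice`s handed to a single `write_vectored` call;
/// 1024 is presumably chosen to stay at or below typical `IOV_MAX` limits.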
const MAX_IOV: usize = 1024;

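/// Size of the streaming read/write buffer (8 MiB).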
const STREAM_BUF: usize = 8 * 1024 * 1024;

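/// Inputs at least this large (4 MiB) are considered worth splitting across
/// rayon worker threads; smaller inputs are processed on one thread.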
const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;

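/// For every 8-bit "keep" mask, the indices of its set bits in ascending order,
/// zero-padded to 8 entries. Each row doubles as a PSHUFB control that left-packs
/// the kept bytes of an 8-byte block (see `compact_8bytes_simd`). Built in const
/// evaluation at compile time.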
#[cfg(target_arch = "x86_64")]
static COMPACT_LUT: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    let mut mask: u16 = 0;
    while mask < 256 {
        let mut idx: usize = 0;
        let mut bit: u8 = 0;
        while bit < 8 {
            if (mask >> bit) & 1 != 0 {
                lut[mask as usize][idx] = bit;
                idx += 1;
            }
            bit += 1;
        }
        mask += 1;
    }
    lut
};

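/// Writes all `slices` using vectored I/O, batching `MAX_IOV` slices per call.
/// A short vectored write falls back to `write_all` on the unwritten remainder.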
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

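/// Allocates a `Vec<u8>` of length `len` without zeroing it; callers must fully
/// overwrite the buffer before reading from it. On Linux, buffers of 2 MiB or
/// more are hinted towards transparent huge pages via `madvise`.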
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    #[cfg(target_os = "linux")]
    if len >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                v.as_mut_ptr() as *mut libc::c_void,
                len,
                libc::MADV_HUGEPAGE,
            );
        }
    }
    v
}

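/// Builds the 256-entry translation table for SET1 -> SET2. When SET2 is shorter
/// than SET1, the remaining SET1 bytes map to the last byte of SET2; an empty
/// SET2 leaves them unchanged.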
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

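/// Builds a 256-bit membership bitset (one bit per byte value) for `chars`.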
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

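// Cached CPU capability level: 0 = not yet detected, 1 = baseline (SSE2),
// 2 = SSSE3, 3 = AVX2. Detection runs once and is then read with relaxed loads.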
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}

#[cfg(target_arch = "x86_64")]
#[inline]
fn count_non_identity(table: &[u8; 256]) -> usize {
    table
        .iter()
        .enumerate()
        .filter(|&(i, &v)| v != i as u8)
        .count()
}

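/// Translates `data` in place through `table`, picking the widest available path:
/// a sparse AVX2 variant when only a handful of byte values are remapped, the
/// full AVX2 or SSSE3 nibble-lookup otherwise, and a scalar loop as the fallback.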
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            let non_id = count_non_identity(table);
            if non_id > 0 && non_id <= 16 {
                unsafe { translate_inplace_avx2_sparse(data, table) };
                return;
            }
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

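// AVX2 table translation via nibble-split PSHUFB lookups (16 rows of 16 table
// bytes, each broadcast across both lanes). The store is skipped whenever a
// 32-byte block comes out unchanged, keeping untouched cache lines clean, which
// is why this variant is preferred for tables with few remapped bytes.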
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            let diff = _mm256_xor_si256(input, result);
            if _mm256_testz_si256(diff, diff) == 0 {
                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            }
            i += 32;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_inplace_neon_table(data, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(ptr.add(i), result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

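// Full AVX2 table translation: the same nibble-split PSHUFB lookup as the sparse
// variant, but unrolled two 32-byte vectors per iteration and always storing.
// The 16-byte and scalar tails reuse the low half of the broadcast LUT.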
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

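/// Translates `src` into `dst` through `table`. On x86_64, AVX2 with non-temporal
/// stores is used when `dst` is 32-byte aligned (large, write-once destinations),
/// regular AVX2 stores otherwise; SSSE3 and scalar paths cover older CPUs.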
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            if dst.as_ptr() as usize & 31 == 0 {
                unsafe { translate_to_avx2_table_nt(src, dst, table) };
            } else {
                unsafe { translate_to_avx2_table(src, dst, table) };
            }
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[cfg(not(target_arch = "aarch64"))]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe { translate_to_neon_table(src, dst, table) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
        for h in 0u8..16 {
            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
        }

        let lo_mask = vdupq_n_u8(0x0F);
        let mut i = 0;

        while i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let lo_nibble = vandq_u8(input, lo_mask);
            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);

            let mut result = vdupq_n_u8(0);
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = vdupq_n_u8($h);
                    let mask = vceqq_u8(hi_nibble, h_val);
                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            vst1q_u8(dp.add(i), result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

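// Same as `translate_to_avx2_table`, but the 32-byte stores are non-temporal
// (`_mm256_stream_si256`) to avoid polluting the cache with a large, write-once
// destination; requires `dst` to be 32-byte aligned and ends with an `sfence`.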
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);
        let mut i = 0;

        while i + 64 <= len {
            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);

            let lo0 = _mm256_and_si256(input0, lo_mask);
            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
            let lo1 = _mm256_and_si256(input1, lo_mask);
            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);

            let mut r0 = _mm256_setzero_si256();
            let mut r1 = _mm256_setzero_si256();

            macro_rules! do_nibble2 {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
                };
            }
            do_nibble2!(0);
            do_nibble2!(1);
            do_nibble2!(2);
            do_nibble2!(3);
            do_nibble2!(4);
            do_nibble2!(5);
            do_nibble2!(6);
            do_nibble2!(7);
            do_nibble2!(8);
            do_nibble2!(9);
            do_nibble2!(10);
            do_nibble2!(11);
            do_nibble2!(12);
            do_nibble2!(13);
            do_nibble2!(14);
            do_nibble2!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }

        _mm_sfence();
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

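/// Detects a table that adds a single constant offset to one contiguous byte range
/// and leaves every other byte alone (e.g. `a-z` -> `A-Z`), returning
/// `(lo, hi, offset)`. The offset is returned as `i8`; since byte arithmetic wraps
/// mod 256, the truncation still produces the correct translation.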
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}

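/// Detects a table that maps one contiguous byte range to a single replacement
/// byte and leaves everything else alone, returning `(lo, hi, replacement)`.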
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}

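// Range-to-constant rewrite without a table lookup. The in-range test uses the
// signed-bias trick: adding (0x80 - lo) maps bytes in [lo, hi] onto
// [-128, -128 + (hi - lo)] as signed lanes, so one signed compare against that
// upper bound classifies every lane; matching lanes are blended to `replacement`.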
#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = replacement;
        }
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_to_constant_neon(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let repl_v = vdupq_n_u8(replacement);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for (i, &b) in src.iter().enumerate() {
        unsafe {
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }
        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2(
    src: &[u8],
    dst: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;
    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }
        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

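// Range + constant-offset rewrite (the `tr a-z A-Z` fast path): bytes inside
// [lo, hi] get `offset` added, everything else is copied through unchanged.
// Dispatches to AVX2 (with non-temporal stores when `dst` is 32-byte aligned),
// SSE2, or NEON, using the same signed-bias range test as the functions above.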
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        if dst.as_ptr() as usize & 31 == 0 {
            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
        } else {
            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
        }
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_stream_si256(dp.add(i) as *mut _, r0);
            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_stream_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }

        _mm_sfence();
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(sp.add(i));
            let in1 = vld1q_u8(sp.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            let r0 = vaddq_u8(in0, off0);
            let r1 = vaddq_u8(in1, off1);
            vst1q_u8(dp.add(i), r0);
            vst1q_u8(dp.add(i + 16), r1);
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(sp.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(dp.add(i), vaddq_u8(input, off));
            i += 16;
        }

        while i < len {
            let b = *sp.add(i);
            *dp.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            macro_rules! do_byte {
                ($off:expr) => {{
                    let b = *sp.add(i + $off);
                    let in_range = b.wrapping_sub(lo) <= range;
                    *dp.add(i + $off) = if in_range {
                        b.wrapping_add(offset_u8)
                    } else {
                        b
                    };
                }};
            }
            do_byte!(0);
            do_byte!(1);
            do_byte!(2);
            do_byte!(3);
            do_byte!(4);
            do_byte!(5);
            do_byte!(6);
            do_byte!(7);
            i += 8;
        }
        while i < len {
            let b = *sp.add(i);
            let in_range = b.wrapping_sub(lo) <= range;
            *dp.add(i) = if in_range {
                b.wrapping_add(offset_u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

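// In-place variant of the range + offset rewrite, used when the buffer can be
// mutated directly instead of translating into a separate destination.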
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 64 <= len {
            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
            let bi0 = _mm256_add_epi8(in0, bias_v);
            let bi1 = _mm256_add_epi8(in1, bias_v);
            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
            let m0 = _mm256_cmpeq_epi8(gt0, zero);
            let m1 = _mm256_cmpeq_epi8(gt1, zero);
            let om0 = _mm256_and_si256(m0, offset_v);
            let om1 = _mm256_and_si256(m1, offset_v);
            let r0 = _mm256_add_epi8(in0, om0);
            let r1 = _mm256_add_epi8(in1, om1);
            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
            i += 64;
        }

        if i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::aarch64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();
        let lo_v = vdupq_n_u8(lo);
        let hi_v = vdupq_n_u8(hi);
        let offset_v = vdupq_n_s8(offset);
        let mut i = 0;

        while i + 32 <= len {
            let in0 = vld1q_u8(ptr.add(i));
            let in1 = vld1q_u8(ptr.add(i + 16));
            let ge0 = vcgeq_u8(in0, lo_v);
            let le0 = vcleq_u8(in0, hi_v);
            let mask0 = vandq_u8(ge0, le0);
            let ge1 = vcgeq_u8(in1, lo_v);
            let le1 = vcleq_u8(in1, hi_v);
            let mask1 = vandq_u8(ge1, le1);
            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
            i += 32;
        }

        if i + 16 <= len {
            let input = vld1q_u8(ptr.add(i));
            let ge = vcgeq_u8(input, lo_v);
            let le = vcleq_u8(input, hi_v);
            let mask = vandq_u8(ge, le);
            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            if b >= lo && b <= hi {
                *ptr.add(i) = b.wrapping_add(offset as u8);
            }
            i += 1;
        }
    }
}

#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    let offset_u8 = offset as u8;
    let range = hi.wrapping_sub(lo);
    for b in data.iter_mut() {
        if b.wrapping_sub(lo) <= range {
            *b = b.wrapping_add(offset_u8);
        }
    }
}

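/// Detects whether a delete set covers exactly one contiguous byte range,
/// returning `(lo, hi)`. Note that the length check only proves contiguity when
/// `chars` contains no duplicates; callers are assumed to pass a deduplicated set.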
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    if (hi as usize - lo as usize + 1) == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}

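// Deletes every byte in [lo, hi] from `src`, compacting the survivors into `dst`
// and returning the number of bytes written. Per 32-byte block, a movemask of the
// in-range lanes yields a keep mask; fully kept blocks are copied verbatim and
// mixed blocks are packed 8 bytes at a time via the COMPACT_LUT shuffle.
// `dst` is assumed to be at least as large as `src`.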
2096#[cfg(target_arch = "x86_64")]
2100fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2101 if get_simd_level() >= 3 {
2102 unsafe { delete_range_avx2(src, dst, lo, hi) }
2103 } else {
2104 unsafe { delete_range_sse2(src, dst, lo, hi) }
2105 }
2106}
2107
2108#[cfg(target_arch = "x86_64")]
2109#[target_feature(enable = "avx2")]
2110unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2111 use std::arch::x86_64::*;
2112
2113 unsafe {
2114 let range = hi - lo;
2115 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2116 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2117 let zero = _mm256_setzero_si256();
2118
2119 let len = src.len();
2120 let sp = src.as_ptr();
2121 let dp = dst.as_mut_ptr();
2122 let mut ri = 0;
2123 let mut wp = 0;
2124
2125 while ri + 32 <= len {
2126 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2127 let biased = _mm256_add_epi8(input, bias_v);
2128 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2130 let in_range = _mm256_cmpeq_epi8(gt, zero);
2132 let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2134
2135 if keep_mask == 0xFFFFFFFF {
2136 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2138 wp += 32;
2139 } else if keep_mask != 0 {
2140 let m0 = keep_mask as u8;
2145 let m1 = (keep_mask >> 8) as u8;
2146 let m2 = (keep_mask >> 16) as u8;
2147 let m3 = (keep_mask >> 24) as u8;
2148
2149 if m0 == 0xFF {
2150 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2151 } else if m0 != 0 {
2152 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2153 }
2154 let c0 = m0.count_ones() as usize;
2155
2156 if m1 == 0xFF {
2157 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2158 } else if m1 != 0 {
2159 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2160 }
2161 let c1 = m1.count_ones() as usize;
2162
2163 if m2 == 0xFF {
2164 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2165 } else if m2 != 0 {
2166 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2167 }
2168 let c2 = m2.count_ones() as usize;
2169
2170 if m3 == 0xFF {
2171 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2172 } else if m3 != 0 {
2173 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2174 }
2175 let c3 = m3.count_ones() as usize;
2176 wp += c0 + c1 + c2 + c3;
2177 }
2178 ri += 32;
2180 }
2181
2182 if ri + 16 <= len {
2184 let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2185 let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2186 let zero128 = _mm_setzero_si128();
2187
2188 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2189 let biased = _mm_add_epi8(input, bias_v128);
2190 let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2191 let in_range = _mm_cmpeq_epi8(gt, zero128);
2192 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2193
2194 if keep_mask == 0xFFFF {
2195 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2196 wp += 16;
2197 } else if keep_mask != 0 {
2198 let m0 = keep_mask as u8;
2199 let m1 = (keep_mask >> 8) as u8;
2200 if m0 == 0xFF {
2201 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2202 } else if m0 != 0 {
2203 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2204 }
2205 let c0 = m0.count_ones() as usize;
2206 if m1 == 0xFF {
2207 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2208 } else if m1 != 0 {
2209 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2210 }
2211 wp += c0 + m1.count_ones() as usize;
2212 }
2213 ri += 16;
2214 }
2215
2216 while ri < len {
2218 let b = *sp.add(ri);
2219 *dp.add(wp) = b;
2220 wp += (b < lo || b > hi) as usize;
2221 ri += 1;
2222 }
2223
2224 wp
2225 }
2226}
2227
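/// Scalar left-pack of 8 bytes: bit i of `mask` set means `src[i]` is kept.
/// Reads the kept-byte indices from COMPACT_LUT and writes 8 bytes to `dst`;
/// callers only consume the leading popcount(mask) bytes.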
2228#[cfg(target_arch = "x86_64")]
2236#[inline(always)]
2237unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2238 unsafe {
2239 let idx = COMPACT_LUT.get_unchecked(mask as usize);
2240 *dst = *src.add(*idx.get_unchecked(0) as usize);
2241 *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2242 *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2243 *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2244 *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2245 *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2246 *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2247 *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2248 }
2249}
2250
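/// SSSE3 left-pack of 8 bytes: loads the source, shuffles the kept bytes to
/// the front with `pshufb` using COMPACT_LUT[mask] as the control, and stores
/// 8 bytes to `dst` (only popcount(mask) of them are meaningful).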
2251#[cfg(target_arch = "x86_64")]
2256#[target_feature(enable = "ssse3")]
2257#[inline]
2258unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2259 use std::arch::x86_64::*;
2260 unsafe {
2261 let src_v = _mm_loadl_epi64(src as *const _);
2262 let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2263 let out_v = _mm_shuffle_epi8(src_v, shuf);
2264 _mm_storel_epi64(dst as *mut _, out_v);
2265 }
2266}
2267
2268#[cfg(target_arch = "x86_64")]
2269#[target_feature(enable = "sse2")]
2270unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2271 use std::arch::x86_64::*;
2272
2273 unsafe {
2274 let range = hi - lo;
2275 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2276 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2277 let zero = _mm_setzero_si128();
2278
2279 let len = src.len();
2280 let sp = src.as_ptr();
2281 let dp = dst.as_mut_ptr();
2282 let mut ri = 0;
2283 let mut wp = 0;
2284
2285 while ri + 16 <= len {
2286 let input = _mm_loadu_si128(sp.add(ri) as *const _);
2287 let biased = _mm_add_epi8(input, bias_v);
2288 let gt = _mm_cmpgt_epi8(biased, threshold_v);
2289 let in_range = _mm_cmpeq_epi8(gt, zero);
2290 let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2291
2292 if keep_mask == 0xFFFF {
2293 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2295 wp += 16;
2296 } else if keep_mask != 0 {
2297 let m0 = keep_mask as u8;
2298 let m1 = (keep_mask >> 8) as u8;
2299 if m0 == 0xFF {
2300 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2301 } else if m0 != 0 {
2302 compact_8bytes(sp.add(ri), dp.add(wp), m0);
2303 }
2304 let c0 = m0.count_ones() as usize;
2305 if m1 == 0xFF {
2306 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2307 } else if m1 != 0 {
2308 compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2309 }
2310 wp += c0 + m1.count_ones() as usize;
2311 }
2312 ri += 16;
2313 }
2314
2315 while ri < len {
2317 let b = *sp.add(ri);
2318 *dp.add(wp) = b;
2319 wp += (b < lo || b > hi) as usize;
2320 ri += 1;
2321 }
2322
2323 wp
2324 }
2325}
2326
2327#[cfg(not(target_arch = "x86_64"))]
2331fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2332 let len = src.len();
2333 let sp = src.as_ptr();
2334 let dp = dst.as_mut_ptr();
2335 let mut wp: usize = 0;
2336 let mut i: usize = 0;
2337
2338 while i + 8 <= len {
2340 unsafe {
2341 let b0 = *sp.add(i);
2342 *dp.add(wp) = b0;
2343 wp += (b0 < lo || b0 > hi) as usize;
2344 let b1 = *sp.add(i + 1);
2345 *dp.add(wp) = b1;
2346 wp += (b1 < lo || b1 > hi) as usize;
2347 let b2 = *sp.add(i + 2);
2348 *dp.add(wp) = b2;
2349 wp += (b2 < lo || b2 > hi) as usize;
2350 let b3 = *sp.add(i + 3);
2351 *dp.add(wp) = b3;
2352 wp += (b3 < lo || b3 > hi) as usize;
2353 let b4 = *sp.add(i + 4);
2354 *dp.add(wp) = b4;
2355 wp += (b4 < lo || b4 > hi) as usize;
2356 let b5 = *sp.add(i + 5);
2357 *dp.add(wp) = b5;
2358 wp += (b5 < lo || b5 > hi) as usize;
2359 let b6 = *sp.add(i + 6);
2360 *dp.add(wp) = b6;
2361 wp += (b6 < lo || b6 > hi) as usize;
2362 let b7 = *sp.add(i + 7);
2363 *dp.add(wp) = b7;
2364 wp += (b7 < lo || b7 > hi) as usize;
2365 }
2366 i += 8;
2367 }
2368
2369 while i < len {
2371 unsafe {
2372 let b = *sp.add(i);
2373 *dp.add(wp) = b;
2374 wp += (b < lo || b > hi) as usize;
2375 }
2376 i += 1;
2377 }
2378
2379 wp
2380}
2381
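/// Streaming delete of a contiguous byte range: reads into a large buffer,
/// compacts the kept bytes in place, and writes them out.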
2382fn delete_range_streaming(
2387 lo: u8,
2388 hi: u8,
2389 reader: &mut impl Read,
2390 writer: &mut impl Write,
2391) -> io::Result<()> {
2392 let mut buf = alloc_uninit_vec(STREAM_BUF);
2395 loop {
2396 let n = read_once(reader, &mut buf)?;
2397 if n == 0 {
2398 break;
2399 }
2400 let wp = delete_range_inplace(&mut buf, n, lo, hi);
2401 if wp > 0 {
2402 writer.write_all(&buf[..wp])?;
2403 }
2404 }
2405 Ok(())
2406}
2407
2408#[inline]
2411fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2412 #[cfg(target_arch = "x86_64")]
2413 {
2414 let level = get_simd_level();
2415 if level >= 3 {
2416 return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2417 }
2418 }
2419 let ptr = buf.as_mut_ptr();
2421 let mut ri = 0;
2422 let mut wp = 0;
2423 unsafe {
2424 while ri + 8 <= n {
2425 let b0 = *ptr.add(ri);
2426 let b1 = *ptr.add(ri + 1);
2427 let b2 = *ptr.add(ri + 2);
2428 let b3 = *ptr.add(ri + 3);
2429 let b4 = *ptr.add(ri + 4);
2430 let b5 = *ptr.add(ri + 5);
2431 let b6 = *ptr.add(ri + 6);
2432 let b7 = *ptr.add(ri + 7);
2433 *ptr.add(wp) = b0;
2434 wp += (b0 < lo || b0 > hi) as usize;
2435 *ptr.add(wp) = b1;
2436 wp += (b1 < lo || b1 > hi) as usize;
2437 *ptr.add(wp) = b2;
2438 wp += (b2 < lo || b2 > hi) as usize;
2439 *ptr.add(wp) = b3;
2440 wp += (b3 < lo || b3 > hi) as usize;
2441 *ptr.add(wp) = b4;
2442 wp += (b4 < lo || b4 > hi) as usize;
2443 *ptr.add(wp) = b5;
2444 wp += (b5 < lo || b5 > hi) as usize;
2445 *ptr.add(wp) = b6;
2446 wp += (b6 < lo || b6 > hi) as usize;
2447 *ptr.add(wp) = b7;
2448 wp += (b7 < lo || b7 > hi) as usize;
2449 ri += 8;
2450 }
2451 while ri < n {
2452 let b = *ptr.add(ri);
2453 *ptr.add(wp) = b;
2454 wp += (b < lo || b > hi) as usize;
2455 ri += 1;
2456 }
2457 }
2458 wp
2459}
2460
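/// AVX2 in-place compaction for range delete: classifies 32 bytes at a time
/// with the biased signed-compare trick, then left-packs each 8-byte group
/// through the shuffle LUT. In-place operation is safe because the write
/// cursor never overtakes the read cursor.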
2461#[cfg(target_arch = "x86_64")]
2464#[target_feature(enable = "avx2")]
2465unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2466 use std::arch::x86_64::*;
2467
2468 unsafe {
2469 let range = hi - lo;
2470 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2471 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2472 let zero = _mm256_setzero_si256();
2473
2474 let ptr = buf.as_mut_ptr();
2475 let mut ri = 0;
2476 let mut wp = 0;
2477
2478 while ri + 32 <= n {
2479 let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2480 let biased = _mm256_add_epi8(input, bias_v);
2481 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2482 let in_range = _mm256_cmpeq_epi8(gt, zero);
2483 let del_mask = _mm256_movemask_epi8(in_range) as u32;
2484
2485 if del_mask == 0 {
2486 if wp != ri {
2488 std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2489 }
2490 wp += 32;
2491 } else if del_mask != 0xFFFFFFFF {
2492 let keep_mask = !del_mask;
2497 let m0 = keep_mask as u8;
2498 let m1 = (keep_mask >> 8) as u8;
2499 let m2 = (keep_mask >> 16) as u8;
2500 let m3 = (keep_mask >> 24) as u8;
2501
2502 let c0 = m0.count_ones() as usize;
2503 let c1 = m1.count_ones() as usize;
2504 let c2 = m2.count_ones() as usize;
2505 let c3 = m3.count_ones() as usize;
2506
2507 if m0 == 0xFF {
2509 std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2510 } else if m0 != 0 {
2511 let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2512 let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2513 let out_v = _mm_shuffle_epi8(src_v, shuf);
2514 _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2515 }
2516
2517 if m1 == 0xFF {
2519 std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2520 } else if m1 != 0 {
2521 let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2522 let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2523 let out_v = _mm_shuffle_epi8(src_v, shuf);
2524 _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2525 }
2526
2527 if m2 == 0xFF {
2529 std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2530 } else if m2 != 0 {
2531 let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2532 let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2533 let out_v = _mm_shuffle_epi8(src_v, shuf);
2534 _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2535 }
2536
2537 if m3 == 0xFF {
2539 std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2540 } else if m3 != 0 {
2541 let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2542 let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2543 let out_v = _mm_shuffle_epi8(src_v, shuf);
2544 _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2545 }
2546
2547 wp += c0 + c1 + c2 + c3;
2548 }
2549 ri += 32;
2551 }
2552
2553 while ri < n {
2555 let b = *ptr.add(ri);
2556 *ptr.add(wp) = b;
2557 wp += (b < lo || b > hi) as usize;
2558 ri += 1;
2559 }
2560
2561 wp
2562 }
2563}
2564
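/// Streaming translation: maps every input byte through the table built from
/// `set1`/`set2`. Identity tables degrade to a plain copy, contiguous-range
/// mappings take the SIMD range fast paths, and large reads are translated in
/// parallel chunks before being written out.
///
/// Usage sketch (illustrative only; the crate/module path and imports are
/// assumed, not taken from this file):
///
/// ```ignore
/// let mut input: &[u8] = b"banana";
/// let mut out: Vec<u8> = Vec::new();
/// // Map 'a' -> 'o' and 'n' -> 'r', like `tr an or`.
/// translate(b"an", b"or", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"bororo");
/// ```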
2565pub fn translate(
2570 set1: &[u8],
2571 set2: &[u8],
2572 reader: &mut impl Read,
2573 writer: &mut impl Write,
2574) -> io::Result<()> {
2575 let table = build_translate_table(set1, set2);
2576
2577 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2579 if is_identity {
2580 return passthrough_stream(reader, writer);
2581 }
2582
2583 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2585 return translate_range_stream(lo, hi, offset, reader, writer);
2586 }
2587
2588 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2591 return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2592 }
2593
2594 let mut buf = alloc_uninit_vec(STREAM_BUF);
2603 loop {
2604 let n = read_once(reader, &mut buf)?;
2605 if n == 0 {
2606 break;
2607 }
2608 if n >= PARALLEL_THRESHOLD {
2609 let nt = rayon::current_num_threads().max(1);
2611 let cs = (n / nt).max(32 * 1024);
2612 buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2613 translate_inplace(chunk, &table);
2614 });
2615 } else {
2616 translate_inplace(&mut buf[..n], &table);
2617 }
2618 writer.write_all(&buf[..n])?;
2619 }
2620 Ok(())
2621}
2622
2623fn translate_range_stream(
2627 lo: u8,
2628 hi: u8,
2629 offset: i8,
2630 reader: &mut impl Read,
2631 writer: &mut impl Write,
2632) -> io::Result<()> {
2633 let mut buf = alloc_uninit_vec(STREAM_BUF);
2634 loop {
2635 let n = read_once(reader, &mut buf)?;
2636 if n == 0 {
2637 break;
2638 }
2639 if n >= PARALLEL_THRESHOLD {
2640 let nt = rayon::current_num_threads().max(1);
2641 let cs = (n / nt).max(32 * 1024);
2642 buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2643 translate_range_simd_inplace(chunk, lo, hi, offset);
2644 });
2645 } else {
2646 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2647 }
2648 writer.write_all(&buf[..n])?;
2649 }
2650 Ok(())
2651}
2652
2653fn translate_range_to_constant_stream(
2657 lo: u8,
2658 hi: u8,
2659 replacement: u8,
2660 reader: &mut impl Read,
2661 writer: &mut impl Write,
2662) -> io::Result<()> {
2663 let mut buf = alloc_uninit_vec(STREAM_BUF);
2664 loop {
2665 let n = read_once(reader, &mut buf)?;
2666 if n == 0 {
2667 break;
2668 }
2669 if n >= PARALLEL_THRESHOLD {
2670 let nt = rayon::current_num_threads().max(1);
2671 let cs = (n / nt).max(32 * 1024);
2672 buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2673 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2674 });
2675 } else {
2676 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2677 }
2678 writer.write_all(&buf[..n])?;
2679 }
2680 Ok(())
2681}
2682
2683fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2686 let mut buf = alloc_uninit_vec(STREAM_BUF);
2687 loop {
2688 let n = read_once(reader, &mut buf)?;
2689 if n == 0 {
2690 break;
2691 }
2692 writer.write_all(&buf[..n])?;
2693 }
2694 Ok(())
2695}
2696
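/// Performs a single read, retrying on `Interrupted`; a return of 0 means end
/// of input.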
2697#[inline]
2703fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2704 loop {
2705 match reader.read(buf) {
2706 Ok(n) => return Ok(n),
2707 Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2708 Err(e) => return Err(e),
2709 }
2710 }
2711}
2712
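/// Combined translate + squeeze (as in `tr -s SET1 SET2`): translates each
/// buffer, then collapses runs of bytes that are members of `set2`, carrying
/// the last squeezed byte across buffer boundaries.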
2713pub fn translate_squeeze(
2714 set1: &[u8],
2715 set2: &[u8],
2716 reader: &mut impl Read,
2717 writer: &mut impl Write,
2718) -> io::Result<()> {
2719 let table = build_translate_table(set1, set2);
2720 let squeeze_set = build_member_set(set2);
2721
2722    if !set2.is_empty() && set2.iter().all(|&b| b == set2[0]) {
2725 let squeeze_ch = set2.last().copied().unwrap_or(0);
2726 return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2727 }
2728
2729 let range_info = detect_range_offset(&table);
2733 let range_const_info = if range_info.is_none() {
2734 detect_range_to_constant(&table)
2735 } else {
2736 None
2737 };
2738
2739 let mut buf = alloc_uninit_vec(STREAM_BUF);
2740 let mut last_squeezed: u16 = 256;
2741
2742 loop {
2743 let n = read_once(reader, &mut buf)?;
2744 if n == 0 {
2745 break;
2746 }
2747 if let Some((lo, hi, offset)) = range_info {
2749 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2750 } else if let Some((lo, hi, replacement)) = range_const_info {
2751 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2752 } else {
2753 translate_inplace(&mut buf[..n], &table);
2754 }
2755 let mut wp = 0;
2757 unsafe {
2758 let ptr = buf.as_mut_ptr();
2759 let mut i = 0;
2760 while i + 8 <= n {
2761 macro_rules! squeeze_byte {
2762 ($off:expr) => {
2763 let b = *ptr.add(i + $off);
2764 if is_member(&squeeze_set, b) {
2765 if last_squeezed != b as u16 {
2766 last_squeezed = b as u16;
2767 *ptr.add(wp) = b;
2768 wp += 1;
2769 }
2770 } else {
2771 last_squeezed = 256;
2772 *ptr.add(wp) = b;
2773 wp += 1;
2774 }
2775 };
2776 }
2777 squeeze_byte!(0);
2778 squeeze_byte!(1);
2779 squeeze_byte!(2);
2780 squeeze_byte!(3);
2781 squeeze_byte!(4);
2782 squeeze_byte!(5);
2783 squeeze_byte!(6);
2784 squeeze_byte!(7);
2785 i += 8;
2786 }
2787 while i < n {
2788 let b = *ptr.add(i);
2789 if is_member(&squeeze_set, b) {
2790 if last_squeezed == b as u16 {
2791 i += 1;
2792 continue;
2793 }
2794 last_squeezed = b as u16;
2795 } else {
2796 last_squeezed = 256;
2797 }
2798 *ptr.add(wp) = b;
2799 wp += 1;
2800 i += 1;
2801 }
2802 }
2803 writer.write_all(&buf[..wp])?;
2804 }
2805 Ok(())
2806}
2807
2808fn translate_squeeze_single_ch(
2812 table: &[u8; 256],
2813 squeeze_ch: u8,
2814 _squeeze_set: &[u8; 32],
2815 reader: &mut impl Read,
2816 writer: &mut impl Write,
2817) -> io::Result<()> {
2818 let range_info = detect_range_offset(table);
2819 let range_const_info = if range_info.is_none() {
2820 detect_range_to_constant(table)
2821 } else {
2822 None
2823 };
2824
2825 let pair = [squeeze_ch, squeeze_ch];
2826 let finder = memchr::memmem::Finder::new(&pair);
2827 let mut buf = alloc_uninit_vec(STREAM_BUF);
2828 let mut was_squeeze_char = false;
2829
2830 loop {
2831 let n = read_once(reader, &mut buf)?;
2832 if n == 0 {
2833 break;
2834 }
2835 if let Some((lo, hi, offset)) = range_info {
2837 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2838 } else if let Some((lo, hi, replacement)) = range_const_info {
2839 translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2840 } else {
2841 translate_inplace(&mut buf[..n], table);
2842 }
2843
2844 let mut i = 0;
2846
2847 if was_squeeze_char {
2849 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2850 i += 1;
2851 }
2852 if i >= n {
2853 continue;
2854 }
2855 }
2856
2857 let ptr = buf.as_mut_ptr();
2858 let mut wp = 0usize;
2859
2860 loop {
2861 match finder.find(&buf[i..n]) {
2862 Some(offset) => {
2863 let seg_end = i + offset + 1;
2864 let gap = seg_end - i;
2865 if gap > 0 {
2866 if wp != i {
2867 unsafe {
2868 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2869 }
2870 }
2871 wp += gap;
2872 }
2873 i = seg_end;
2874 while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2875 i += 1;
2876 }
2877 if i >= n {
2878 was_squeeze_char = true;
2879 break;
2880 }
2881 }
2882 None => {
2883 let rem = n - i;
2884 if rem > 0 {
2885 if wp != i {
2886 unsafe {
2887 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2888 }
2889 }
2890 wp += rem;
2891 }
2892 was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2893 break;
2894 }
2895 }
2896 }
2897
2898 if wp > 0 {
2899 writer.write_all(&buf[..wp])?;
2900 }
2901 }
2902 Ok(())
2903}
2904
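/// Streaming delete (as in `tr -d SET`). One to three characters use
/// memchr-based scanning, a contiguous range uses the range compactor, and
/// anything else goes through the 256-bit membership bitset.
///
/// Usage sketch (illustrative only; paths and imports assumed):
///
/// ```ignore
/// let mut input: &[u8] = b"hello, world";
/// let mut out: Vec<u8> = Vec::new();
/// delete(b"lo", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"he, wrd");
/// ```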
2905pub fn delete(
2906 delete_chars: &[u8],
2907 reader: &mut impl Read,
2908 writer: &mut impl Write,
2909) -> io::Result<()> {
2910 if delete_chars.len() == 1 {
2911 return delete_single_streaming(delete_chars[0], reader, writer);
2912 }
2913 if delete_chars.len() <= 3 {
2914 return delete_multi_streaming(delete_chars, reader, writer);
2915 }
2916
2917 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2921 return delete_range_streaming(lo, hi, reader, writer);
2922 }
2923
2924 let member = build_member_set(delete_chars);
2925 let mut buf = alloc_uninit_vec(STREAM_BUF);
2926 let mut outbuf = alloc_uninit_vec(STREAM_BUF);
2929
2930 loop {
2931 let n = read_once(reader, &mut buf)?;
2932 if n == 0 {
2933 break;
2934 }
2935 #[cfg(target_arch = "x86_64")]
2936 let wp = if get_simd_level() >= 3 {
2937 unsafe { delete_bitset_avx2_stream(&buf[..n], &mut outbuf, &member) }
2938 } else {
2939 delete_bitset_scalar(&buf[..n], &mut outbuf, &member)
2940 };
2941 #[cfg(not(target_arch = "x86_64"))]
2942 let wp = delete_bitset_scalar(&buf[..n], &mut outbuf, &member);
2943 if wp > 0 {
2944 writer.write_all(&outbuf[..wp])?;
2945 }
2946 }
2947 Ok(())
2948}
2949
2950#[inline]
2952fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
2953 let n = src.len();
2954 let mut wp = 0;
2955 unsafe {
2956 let sp = src.as_ptr();
2957 let dp = dst.as_mut_ptr();
2958 let mut i = 0;
2959 while i + 8 <= n {
2960 let b0 = *sp.add(i);
2961 let b1 = *sp.add(i + 1);
2962 let b2 = *sp.add(i + 2);
2963 let b3 = *sp.add(i + 3);
2964 let b4 = *sp.add(i + 4);
2965 let b5 = *sp.add(i + 5);
2966 let b6 = *sp.add(i + 6);
2967 let b7 = *sp.add(i + 7);
2968 *dp.add(wp) = b0;
2969 wp += !is_member(member, b0) as usize;
2970 *dp.add(wp) = b1;
2971 wp += !is_member(member, b1) as usize;
2972 *dp.add(wp) = b2;
2973 wp += !is_member(member, b2) as usize;
2974 *dp.add(wp) = b3;
2975 wp += !is_member(member, b3) as usize;
2976 *dp.add(wp) = b4;
2977 wp += !is_member(member, b4) as usize;
2978 *dp.add(wp) = b5;
2979 wp += !is_member(member, b5) as usize;
2980 *dp.add(wp) = b6;
2981 wp += !is_member(member, b6) as usize;
2982 *dp.add(wp) = b7;
2983 wp += !is_member(member, b7) as usize;
2984 i += 8;
2985 }
2986 while i < n {
2987 let b = *sp.add(i);
2988 *dp.add(wp) = b;
2989 wp += !is_member(member, b) as usize;
2990 i += 1;
2991 }
2992 }
2993 wp
2994}
2995
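/// AVX2 bitset delete: computes each byte's bitset index (b >> 3) and bit
/// position (b & 7) in vector registers, fetches the matching bitset bytes by
/// shuffling the two 16-byte halves of `member` and blending on index bit 4,
/// then tests the bit to form a keep mask. Kept bytes are left-packed 8 at a
/// time via the shuffle LUT; the tail uses the scalar membership test.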
2996#[cfg(target_arch = "x86_64")]
2999#[target_feature(enable = "avx2")]
3000unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3001 use std::arch::x86_64::*;
3002
3003 unsafe {
3004 let n = src.len();
3005 let sp = src.as_ptr();
3006 let dp = dst.as_mut_ptr();
3007 let mut ri = 0;
3008 let mut wp = 0;
3009
3010 let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3013
3014 let mask7 = _mm256_set1_epi8(7);
3017 let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3018
3019 let bit_table = _mm256_setr_epi8(
3022 1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3023 0, 0, 0, 0, 0, 0, 0, 0,
3024 );
3025
3026 while ri + 32 <= n {
3027 let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3028
3029 let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3031 let bit_pos = _mm256_and_si256(input, mask7);
3033 let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3035
3036 let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3044 let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3045 let lo_mask = _mm256_set1_epi8(0x0F);
3046 let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3047 let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3048 let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3049            let use_hi = _mm256_slli_epi16(byte_idx, 3);
            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3052
3053 let test = _mm256_and_si256(member_byte, bit_mask);
3055 let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3056 let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3058
3059 if keep_mask == 0xFFFFFFFF {
3060 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3062 wp += 32;
3063 } else if keep_mask != 0 {
3064 let m0 = keep_mask as u8;
3066 let m1 = (keep_mask >> 8) as u8;
3067 let m2 = (keep_mask >> 16) as u8;
3068 let m3 = (keep_mask >> 24) as u8;
3069
3070 if m0 == 0xFF {
3071 std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3072 } else if m0 != 0 {
3073 compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3074 }
3075 let c0 = m0.count_ones() as usize;
3076
3077 if m1 == 0xFF {
3078 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3079 } else if m1 != 0 {
3080 compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3081 }
3082 let c1 = m1.count_ones() as usize;
3083
3084 if m2 == 0xFF {
3085 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3086 } else if m2 != 0 {
3087 compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3088 }
3089 let c2 = m2.count_ones() as usize;
3090
3091 if m3 == 0xFF {
3092 std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3093 } else if m3 != 0 {
3094 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3095 }
3096 let c3 = m3.count_ones() as usize;
3097 wp += c0 + c1 + c2 + c3;
3098 }
3099 ri += 32;
3101 }
3102
3103 while ri < n {
3105 let b = *sp.add(ri);
3106 *dp.add(wp) = b;
3107 wp += !is_member(member, b) as usize;
3108 ri += 1;
3109 }
3110
3111 wp
3112 }
3113}
3114
3115fn delete_single_streaming(
3116 ch: u8,
3117 reader: &mut impl Read,
3118 writer: &mut impl Write,
3119) -> io::Result<()> {
3120 let mut buf = alloc_uninit_vec(STREAM_BUF);
3123 loop {
3124 let n = read_once(reader, &mut buf)?;
3125 if n == 0 {
3126 break;
3127 }
3128 let mut wp = 0;
3129 let mut i = 0;
3130 while i < n {
3131 match memchr::memchr(ch, &buf[i..n]) {
3132 Some(offset) => {
3133 if offset > 0 {
3134 if wp != i {
3135 unsafe {
3136 std::ptr::copy(
3137 buf.as_ptr().add(i),
3138 buf.as_mut_ptr().add(wp),
3139 offset,
3140 );
3141 }
3142 }
3143 wp += offset;
3144 }
3145 i += offset + 1;
3146 }
3147 None => {
3148 let run_len = n - i;
3149 if run_len > 0 {
3150 if wp != i {
3151 unsafe {
3152 std::ptr::copy(
3153 buf.as_ptr().add(i),
3154 buf.as_mut_ptr().add(wp),
3155 run_len,
3156 );
3157 }
3158 }
3159 wp += run_len;
3160 }
3161 break;
3162 }
3163 }
3164 }
3165 if wp > 0 {
3166 writer.write_all(&buf[..wp])?;
3167 }
3168 }
3169 Ok(())
3170}
3171
3172fn delete_multi_streaming(
3173 chars: &[u8],
3174 reader: &mut impl Read,
3175 writer: &mut impl Write,
3176) -> io::Result<()> {
3177 let mut buf = alloc_uninit_vec(STREAM_BUF);
3180 loop {
3181 let n = read_once(reader, &mut buf)?;
3182 if n == 0 {
3183 break;
3184 }
3185 let mut wp = 0;
3186 let mut i = 0;
3187 while i < n {
3188 let found = if chars.len() == 2 {
3189 memchr::memchr2(chars[0], chars[1], &buf[i..n])
3190 } else {
3191 memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3192 };
3193 match found {
3194 Some(offset) => {
3195 if offset > 0 {
3196 if wp != i {
3197 unsafe {
3198 std::ptr::copy(
3199 buf.as_ptr().add(i),
3200 buf.as_mut_ptr().add(wp),
3201 offset,
3202 );
3203 }
3204 }
3205 wp += offset;
3206 }
3207 i += offset + 1;
3208 }
3209 None => {
3210 let run_len = n - i;
3211 if run_len > 0 {
3212 if wp != i {
3213 unsafe {
3214 std::ptr::copy(
3215 buf.as_ptr().add(i),
3216 buf.as_mut_ptr().add(wp),
3217 run_len,
3218 );
3219 }
3220 }
3221 wp += run_len;
3222 }
3223 break;
3224 }
3225 }
3226 }
3227 if wp > 0 {
3228 writer.write_all(&buf[..wp])?;
3229 }
3230 }
3231 Ok(())
3232}
3233
3234pub fn delete_squeeze(
3235 delete_chars: &[u8],
3236 squeeze_chars: &[u8],
3237 reader: &mut impl Read,
3238 writer: &mut impl Write,
3239) -> io::Result<()> {
3240 let delete_set = build_member_set(delete_chars);
3241 let squeeze_set = build_member_set(squeeze_chars);
3242 let mut buf = alloc_uninit_vec(STREAM_BUF);
3243 let mut last_squeezed: u16 = 256;
3244
3245 loop {
3246 let n = read_once(reader, &mut buf)?;
3247 if n == 0 {
3248 break;
3249 }
3250 let mut wp = 0;
3254 unsafe {
3255 let ptr = buf.as_mut_ptr();
3256 let mut i = 0;
3257 while i + 8 <= n {
3258 macro_rules! process_byte {
3259 ($off:expr) => {
3260 let b = *ptr.add(i + $off);
3261 if !is_member(&delete_set, b) {
3262 if is_member(&squeeze_set, b) {
3263 if last_squeezed != b as u16 {
3264 last_squeezed = b as u16;
3265 *ptr.add(wp) = b;
3266 wp += 1;
3267 }
3268 } else {
3269 last_squeezed = 256;
3270 *ptr.add(wp) = b;
3271 wp += 1;
3272 }
3273 }
3274 };
3275 }
3276 process_byte!(0);
3277 process_byte!(1);
3278 process_byte!(2);
3279 process_byte!(3);
3280 process_byte!(4);
3281 process_byte!(5);
3282 process_byte!(6);
3283 process_byte!(7);
3284 i += 8;
3285 }
3286 while i < n {
3287 let b = *ptr.add(i);
3288 if !is_member(&delete_set, b) {
3289 if is_member(&squeeze_set, b) {
3290 if last_squeezed != b as u16 {
3291 last_squeezed = b as u16;
3292 *ptr.add(wp) = b;
3293 wp += 1;
3294 }
3295 } else {
3296 last_squeezed = 256;
3297 *ptr.add(wp) = b;
3298 wp += 1;
3299 }
3300 }
3301 i += 1;
3302 }
3303 }
3304 writer.write_all(&buf[..wp])?;
3305 }
3306 Ok(())
3307}
3308
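/// Streaming squeeze (as in `tr -s SET`): collapses runs of repeated bytes
/// that belong to the set down to a single occurrence.
///
/// Usage sketch (illustrative only; paths and imports assumed):
///
/// ```ignore
/// let mut input: &[u8] = b"aaabbbccc";
/// let mut out: Vec<u8> = Vec::new();
/// squeeze(b"ab", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"abccc");
/// ```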
3309pub fn squeeze(
3310 squeeze_chars: &[u8],
3311 reader: &mut impl Read,
3312 writer: &mut impl Write,
3313) -> io::Result<()> {
3314 if squeeze_chars.len() == 1 {
3315 return squeeze_single_stream(squeeze_chars[0], reader, writer);
3316 }
3317
3318 if squeeze_chars.len() <= 3 {
3321 return squeeze_multi_stream(squeeze_chars, reader, writer);
3322 }
3323
3324 let member = build_member_set(squeeze_chars);
3325 let mut buf = alloc_uninit_vec(STREAM_BUF);
3326 let mut last_squeezed: u16 = 256;
3327
3328 loop {
3329 let n = read_once(reader, &mut buf)?;
3330 if n == 0 {
3331 break;
3332 }
3333 let mut wp = 0;
3334 unsafe {
3335 let ptr = buf.as_mut_ptr();
3336 for i in 0..n {
3337 let b = *ptr.add(i);
3338 if is_member(&member, b) {
3339 if last_squeezed == b as u16 {
3340 continue;
3341 }
3342 last_squeezed = b as u16;
3343 } else {
3344 last_squeezed = 256;
3345 }
3346 *ptr.add(wp) = b;
3347 wp += 1;
3348 }
3349 }
3350 writer.write_all(&buf[..wp])?;
3351 }
3352 Ok(())
3353}
3354
3355fn squeeze_multi_stream(
3359 chars: &[u8],
3360 reader: &mut impl Read,
3361 writer: &mut impl Write,
3362) -> io::Result<()> {
3363 let c0 = chars[0];
3364 let c1 = chars[1];
3365 let c2 = if chars.len() >= 3 {
3366 Some(chars[2])
3367 } else {
3368 None
3369 };
3372
3373 let mut buf = alloc_uninit_vec(STREAM_BUF);
3374 let mut last_squeezed: u16 = 256;
3375
3376 loop {
3377 let n = read_once(reader, &mut buf)?;
3378 if n == 0 {
3379 break;
3380 }
3381
3382 let ptr = buf.as_mut_ptr();
3386 let mut wp = 0usize;
3387 let mut cursor = 0usize;
3388
3389 macro_rules! find_next {
3390 ($start:expr) => {
3391 if let Some(c) = c2 {
3392 memchr::memchr3(c0, c1, c, &buf[$start..n])
3393 } else {
3394 memchr::memchr2(c0, c1, &buf[$start..n])
3395 }
3396 };
3397 }
3398
3399 while cursor < n {
3400 match find_next!(cursor) {
3401 Some(offset) => {
3402 let pos = cursor + offset;
3403 let b = unsafe { *ptr.add(pos) };
3404
3405 let gap = pos - cursor;
3407 if gap > 0 {
3408 if wp != cursor {
3409 unsafe {
3410 std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3411 }
3412 }
3413 wp += gap;
3414 last_squeezed = 256;
3415 }
3416
3417 if last_squeezed != b as u16 {
3419 unsafe { *ptr.add(wp) = b };
3420 wp += 1;
3421 last_squeezed = b as u16;
3422 }
3423
3424 cursor = pos + 1;
3426 while cursor < n && unsafe { *ptr.add(cursor) } == b {
3427 cursor += 1;
3428 }
3429 }
3430 None => {
3431 let rem = n - cursor;
3433 if rem > 0 {
3434 if wp != cursor {
3435 unsafe {
3436 std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3437 }
3438 }
3439 wp += rem;
3440 last_squeezed = 256;
3441 }
3442 break;
3443 }
3444 }
3445 }
3446
3447 writer.write_all(&buf[..wp])?;
3448 }
3449 Ok(())
3450}
3451
3452fn squeeze_single_stream(
3453 ch: u8,
3454 reader: &mut impl Read,
3455 writer: &mut impl Write,
3456) -> io::Result<()> {
3457 let pair = [ch, ch];
3461 let finder = memchr::memmem::Finder::new(&pair);
3462 let mut buf = alloc_uninit_vec(STREAM_BUF);
3463 let mut was_squeeze_char = false;
3464
3465 loop {
3466 let n = read_once(reader, &mut buf)?;
3467 if n == 0 {
3468 break;
3469 }
3470
3471 let mut i = 0;
3472
3473 if was_squeeze_char {
3476 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3477 i += 1;
3478 }
3479 if i >= n {
3480 continue;
3481 }
3482 }
3483
3484 let ptr = buf.as_mut_ptr();
3486 let mut wp = 0usize;
3487
3488 loop {
3489 match finder.find(&buf[i..n]) {
3490 Some(offset) => {
3491 let seg_end = i + offset + 1;
3493 let gap = seg_end - i;
3494 if gap > 0 {
3495 if wp != i {
3496 unsafe {
3497 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3498 }
3499 }
3500 wp += gap;
3501 }
3502 i = seg_end;
3503 while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3505 i += 1;
3506 }
3507 if i >= n {
3508 was_squeeze_char = true;
3509 break;
3510 }
3511 }
3512 None => {
3513 let rem = n - i;
3515 if rem > 0 {
3516 if wp != i {
3517 unsafe {
3518 std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3519 }
3520 }
3521 wp += rem;
3522 }
3523 was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3524 break;
3525 }
3526 }
3527 }
3528
3529 if wp > 0 {
3530 writer.write_all(&buf[..wp])?;
3531 }
3532 }
3533 Ok(())
3534}
3535
3536pub fn translate_owned(
3544 set1: &[u8],
3545 set2: &[u8],
3546 data: &mut [u8],
3547 writer: &mut impl Write,
3548) -> io::Result<()> {
3549 let table = build_translate_table(set1, set2);
3550
3551 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3553 if is_identity {
3554 return writer.write_all(data);
3555 }
3556
3557 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3559 if data.len() >= PARALLEL_THRESHOLD {
3560 let n_threads = rayon::current_num_threads().max(1);
3561 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3562 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3563 translate_range_simd_inplace(chunk, lo, hi, offset);
3564 });
3565 } else {
3566 translate_range_simd_inplace(data, lo, hi, offset);
3567 }
3568 return writer.write_all(data);
3569 }
3570
3571 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3573 if data.len() >= PARALLEL_THRESHOLD {
3574 let n_threads = rayon::current_num_threads().max(1);
3575 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3576 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3577 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3578 });
3579 } else {
3580 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3581 }
3582 return writer.write_all(data);
3583 }
3584
3585 if data.len() >= PARALLEL_THRESHOLD {
3587 let n_threads = rayon::current_num_threads().max(1);
3588 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3589 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3590 translate_inplace(chunk, &table);
3591 });
3592 } else {
3593 translate_inplace(data, &table);
3594 }
3595 writer.write_all(data)
3596}
3597
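/// Translation over a read-only memory-mapped input: picks the range-offset,
/// range-to-constant, or full-table path and writes translated bytes through
/// a separate buffer so the mapping itself is never modified.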
3598pub fn translate_mmap(
3612 set1: &[u8],
3613 set2: &[u8],
3614 data: &[u8],
3615 writer: &mut impl Write,
3616) -> io::Result<()> {
3617 let table = build_translate_table(set1, set2);
3618
3619 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3621 if is_identity {
3622 return writer.write_all(data);
3623 }
3624
3625 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3627 return translate_mmap_range(data, writer, lo, hi, offset);
3628 }
3629
3630 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3632 return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3633 }
3634
3635 translate_mmap_table(data, writer, &table)
3637}
3638
3639fn translate_mmap_range(
3641 data: &[u8],
3642 writer: &mut impl Write,
3643 lo: u8,
3644 hi: u8,
3645 offset: i8,
3646) -> io::Result<()> {
3647 if data.len() >= PARALLEL_THRESHOLD {
3649 let mut buf = alloc_uninit_vec(data.len());
3650 let n_threads = rayon::current_num_threads().max(1);
3651 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3652
3653 data.par_chunks(chunk_size)
3655 .zip(buf.par_chunks_mut(chunk_size))
3656 .for_each(|(src_chunk, dst_chunk)| {
3657 translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3658 });
3659
3660 return writer.write_all(&buf);
3661 }
3662
3663 const CHUNK: usize = 256 * 1024;
3665 let buf_size = data.len().min(CHUNK);
3666 let mut buf = alloc_uninit_vec(buf_size);
3667 for chunk in data.chunks(CHUNK) {
3668 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3669 writer.write_all(&buf[..chunk.len()])?;
3670 }
3671 Ok(())
3672}
3673
3674fn translate_mmap_range_to_constant(
3677 data: &[u8],
3678 writer: &mut impl Write,
3679 lo: u8,
3680 hi: u8,
3681 replacement: u8,
3682) -> io::Result<()> {
3683 if data.len() >= PARALLEL_THRESHOLD {
3685 let mut buf = alloc_uninit_vec(data.len());
3686 let n_threads = rayon::current_num_threads().max(1);
3687 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3688
3689 data.par_chunks(chunk_size)
3691 .zip(buf.par_chunks_mut(chunk_size))
3692 .for_each(|(src_chunk, dst_chunk)| {
3693 dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3694 translate_range_to_constant_simd_inplace(
3695 &mut dst_chunk[..src_chunk.len()],
3696 lo,
3697 hi,
3698 replacement,
3699 );
3700 });
3701
3702 return writer.write_all(&buf);
3703 }
3704
3705 const CHUNK: usize = 256 * 1024;
3707 let buf_size = data.len().min(CHUNK);
3708 let mut buf = alloc_uninit_vec(buf_size);
3709 for chunk in data.chunks(CHUNK) {
3710 buf[..chunk.len()].copy_from_slice(chunk);
3711 translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3712 writer.write_all(&buf[..chunk.len()])?;
3713 }
3714 Ok(())
3715}
3716
3717fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3719 if data.len() >= PARALLEL_THRESHOLD {
3721 let mut buf = alloc_uninit_vec(data.len());
3722 let n_threads = rayon::current_num_threads().max(1);
3723 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3724
3725 data.par_chunks(chunk_size)
3726 .zip(buf.par_chunks_mut(chunk_size))
3727 .for_each(|(src_chunk, dst_chunk)| {
3728 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3729 });
3730
3731 return writer.write_all(&buf);
3732 }
3733
3734 const CHUNK: usize = 256 * 1024;
3736 let buf_size = data.len().min(CHUNK);
3737 let mut buf = alloc_uninit_vec(buf_size);
3738 for chunk in data.chunks(CHUNK) {
3739 translate_to(chunk, &mut buf[..chunk.len()], table);
3740 writer.write_all(&buf[..chunk.len()])?;
3741 }
3742 Ok(())
3743}
3744
3745pub fn translate_mmap_inplace(
3752 set1: &[u8],
3753 set2: &[u8],
3754 data: &mut [u8],
3755 writer: &mut impl Write,
3756) -> io::Result<()> {
3757 let table = build_translate_table(set1, set2);
3758
3759 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3761 if is_identity {
3762 return writer.write_all(data);
3763 }
3764
3765 const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;
3772
3773 if data.len() < SEPARATE_BUF_THRESHOLD {
3774 return translate_to_separate_buf(data, &table, writer);
3775 }
3776
3777 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3779 if data.len() >= PARALLEL_THRESHOLD {
3780 let n_threads = rayon::current_num_threads().max(1);
3781 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3782 data.par_chunks_mut(chunk_size)
3783 .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3784 } else {
3785 translate_range_simd_inplace(data, lo, hi, offset);
3786 }
3787 return writer.write_all(data);
3788 }
3789
3790 if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3792 if data.len() >= PARALLEL_THRESHOLD {
3793 let n_threads = rayon::current_num_threads().max(1);
3794 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3795 data.par_chunks_mut(chunk_size).for_each(|chunk| {
3796 translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3797 });
3798 } else {
3799 translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3800 }
3801 return writer.write_all(data);
3802 }
3803
3804 if data.len() >= PARALLEL_THRESHOLD {
3806 let n_threads = rayon::current_num_threads().max(1);
3807 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3808 data.par_chunks_mut(chunk_size)
3809 .for_each(|chunk| translate_inplace(chunk, &table));
3810 } else {
3811 translate_inplace(data, &table);
3812 }
3813 writer.write_all(data)
3814}
3815
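/// Translates `data` into a freshly allocated buffer of the same length and
/// writes it out in one call, using the same fast-path detection as the
/// streaming code; large inputs are translated in parallel chunks. The source
/// slice is left untouched.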
3816fn translate_to_separate_buf(
3824 data: &[u8],
3825 table: &[u8; 256],
3826 writer: &mut impl Write,
3827) -> io::Result<()> {
3828 let range_info = detect_range_offset(table);
3829 let const_info = if range_info.is_none() {
3830 detect_range_to_constant(table)
3831 } else {
3832 None
3833 };
3834
3835 if data.len() >= PARALLEL_THRESHOLD {
3836 let mut out_buf = alloc_uninit_vec(data.len());
3838 let n_threads = rayon::current_num_threads().max(1);
3839 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3840
3841 if let Some((lo, hi, offset)) = range_info {
3842 data.par_chunks(chunk_size)
3843 .zip(out_buf.par_chunks_mut(chunk_size))
3844 .for_each(|(src, dst)| {
3845 translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3846 });
3847 } else if let Some((lo, hi, replacement)) = const_info {
3848 data.par_chunks(chunk_size)
3849 .zip(out_buf.par_chunks_mut(chunk_size))
3850 .for_each(|(src, dst)| {
3851 translate_range_to_constant_simd(
3852 src,
3853 &mut dst[..src.len()],
3854 lo,
3855 hi,
3856 replacement,
3857 );
3858 });
3859 } else {
3860 data.par_chunks(chunk_size)
3861 .zip(out_buf.par_chunks_mut(chunk_size))
3862 .for_each(|(src, dst)| {
3863 translate_to(src, &mut dst[..src.len()], table);
3864 });
3865 }
3866 return writer.write_all(&out_buf);
3867 }
3868
3869 let mut out_buf = alloc_uninit_vec(data.len());
3875 if let Some((lo, hi, offset)) = range_info {
3876 translate_range_simd(data, &mut out_buf, lo, hi, offset);
3877 } else if let Some((lo, hi, replacement)) = const_info {
3878 translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3879 } else {
3880 translate_to(data, &mut out_buf, table);
3881 }
3882 writer.write_all(&out_buf)
3883}
3884
3885pub fn translate_mmap_readonly(
3889 set1: &[u8],
3890 set2: &[u8],
3891 data: &[u8],
3892 writer: &mut impl Write,
3893) -> io::Result<()> {
3894 let table = build_translate_table(set1, set2);
3895
3896 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3898 if is_identity {
3899 return writer.write_all(data);
3900 }
3901
3902 translate_to_separate_buf(data, &table, writer)
3903}
3904
3905pub fn translate_squeeze_mmap(
3911 set1: &[u8],
3912 set2: &[u8],
3913 data: &[u8],
3914 writer: &mut impl Write,
3915) -> io::Result<()> {
3916 let table = build_translate_table(set1, set2);
3917 let squeeze_set = build_member_set(set2);
3918
3919 if data.len() >= PARALLEL_THRESHOLD {
3924 let mut translated = alloc_uninit_vec(data.len());
3926 let range_info = detect_range_offset(&table);
3927 let n_threads = rayon::current_num_threads().max(1);
3928 let chunk_size = (data.len() / n_threads).max(32 * 1024);
3929
3930 if let Some((lo, hi, offset)) = range_info {
3931 data.par_chunks(chunk_size)
3932 .zip(translated.par_chunks_mut(chunk_size))
3933 .for_each(|(src_chunk, dst_chunk)| {
3934 translate_range_simd(
3935 src_chunk,
3936 &mut dst_chunk[..src_chunk.len()],
3937 lo,
3938 hi,
3939 offset,
3940 );
3941 });
3942 } else {
3943 data.par_chunks(chunk_size)
3944 .zip(translated.par_chunks_mut(chunk_size))
3945 .for_each(|(src_chunk, dst_chunk)| {
3946 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
3947 });
3948 }
3949
3950 let mut last_squeezed: u16 = 256;
3954 let len = translated.len();
3955 let mut wp = 0;
3956 unsafe {
3957 let ptr = translated.as_mut_ptr();
3958 let mut i = 0;
3959 while i < len {
3960 let b = *ptr.add(i);
3961 if is_member(&squeeze_set, b) {
3962 if last_squeezed == b as u16 {
3963 i += 1;
3964 continue;
3965 }
3966 last_squeezed = b as u16;
3967 } else {
3968 last_squeezed = 256;
3969 }
3970 *ptr.add(wp) = b;
3971 wp += 1;
3972 i += 1;
3973 }
3974 }
3975 return writer.write_all(&translated[..wp]);
3976 }
3977
3978 let mut buf = alloc_uninit_vec(data.len());
3981 translate_to(data, &mut buf, &table);
3982 let mut last_squeezed: u16 = 256;
3983 let mut wp = 0;
3984 unsafe {
3985 let ptr = buf.as_mut_ptr();
3986 for i in 0..data.len() {
3987 let b = *ptr.add(i);
3988 if is_member(&squeeze_set, b) {
3989 if last_squeezed == b as u16 {
3990 continue;
3991 }
3992 last_squeezed = b as u16;
3993 } else {
3994 last_squeezed = 256;
3995 }
3996 *ptr.add(wp) = b;
3997 wp += 1;
3998 }
3999 }
4000 writer.write_all(&buf[..wp])
4001}
4002
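/// Delete over a memory-mapped input. Small sets go through the memchr and
/// contiguous-range fast paths; otherwise the first bytes are sampled to
/// estimate delete density: sparse deletions are emitted zero-copy as IoSlice
/// runs over the original data, dense ones are compacted into an output
/// buffer (in parallel for large inputs).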
4003pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4009 if delete_chars.len() == 1 {
4010 return delete_single_char_mmap(delete_chars[0], data, writer);
4011 }
4012 if delete_chars.len() <= 3 {
4013 return delete_multi_memchr_mmap(delete_chars, data, writer);
4014 }
4015
4016 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4018 return delete_range_mmap(data, writer, lo, hi);
4019 }
4020
4021 let member = build_member_set(delete_chars);
4022
4023 let sample_size = data.len().min(1024);
4028 let sample_deletes = data[..sample_size]
4029 .iter()
4030 .filter(|&&b| is_member(&member, b))
4031 .count();
4032 let estimated_deletes = if sample_size > 0 {
4033 data.len() * sample_deletes / sample_size
4034 } else {
4035 data.len()
4036 };
4037
4038 if estimated_deletes < MAX_IOV / 2 {
4039 return delete_bitset_zerocopy(data, &member, writer);
4040 }
4041
4042 if data.len() >= PARALLEL_THRESHOLD {
4044 let n_threads = rayon::current_num_threads().max(1);
4045 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4046
4047 let mut outbuf = alloc_uninit_vec(data.len());
4048 let chunk_lens: Vec<usize> = data
4049 .par_chunks(chunk_size)
4050 .zip(outbuf.par_chunks_mut(chunk_size))
4051 .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4052 .collect();
4053
4054 let slices: Vec<std::io::IoSlice> = chunk_lens
4058 .iter()
4059 .enumerate()
4060 .filter(|&(_, &len)| len > 0)
4061 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4062 .collect();
4063 return write_ioslices(writer, &slices);
4064 }
4065
4066 const COMPACT_BUF: usize = 256 * 1024;
4069 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4070
4071 for chunk in data.chunks(COMPACT_BUF) {
4072 let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4073 if out_pos > 0 {
4074 writer.write_all(&outbuf[..out_pos])?;
4075 }
4076 }
4077 Ok(())
4078}
4079
4080fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4085 let sample_size = data.len().min(1024);
4087 let sample_deletes = data[..sample_size]
4088 .iter()
4089 .filter(|&&b| b >= lo && b <= hi)
4090 .count();
4091 let estimated_deletes = if sample_size > 0 {
4097 data.len() * sample_deletes / sample_size
4098 } else {
4099 data.len()
4100 };
4101 if estimated_deletes < MAX_IOV / 2 {
4102 return delete_range_mmap_zerocopy(data, writer, lo, hi);
4103 }
4104
4105 if data.len() >= PARALLEL_THRESHOLD {
4107 let n_threads = rayon::current_num_threads().max(1);
4108 let chunk_size = (data.len() / n_threads).max(32 * 1024);
4109
4110 let mut outbuf = alloc_uninit_vec(data.len());
4111 let chunk_lens: Vec<usize> = data
4112 .par_chunks(chunk_size)
4113 .zip(outbuf.par_chunks_mut(chunk_size))
4114 .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
4115 .collect();
4116
4117 let slices: Vec<std::io::IoSlice> = chunk_lens
4120 .iter()
4121 .enumerate()
4122 .filter(|&(_, &len)| len > 0)
4123 .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4124 .collect();
4125 return write_ioslices(writer, &slices);
4126 }
4127
4128 const COMPACT_BUF: usize = 256 * 1024;
4132 let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4133
4134 #[cfg(target_arch = "x86_64")]
4135 {
4136 let mut wp = 0;
4137 let level = get_simd_level();
4138 let len = data.len();
4139 let sp = data.as_ptr();
4140 let dp = outbuf.as_mut_ptr();
4141 let mut ri = 0;
4142
4143 if level >= 3 {
4144 use std::arch::x86_64::*;
4145 let range = hi - lo;
4146 let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
4147 let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
4148 let zero = unsafe { _mm256_setzero_si256() };
4149
4150 while ri + 32 <= len {
4151 if wp + 32 > COMPACT_BUF {
4153 writer.write_all(&outbuf[..wp])?;
4154 wp = 0;
4155 }
4156
4157 let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
4158 let biased = unsafe { _mm256_add_epi8(input, bias_v) };
4159 let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
4160 let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
4161 let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
4162
4163 if keep_mask == 0xFFFFFFFF {
4164 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
4165 wp += 32;
4166 } else if keep_mask != 0 {
4167 let m0 = keep_mask as u8;
4168 let m1 = (keep_mask >> 8) as u8;
4169 let m2 = (keep_mask >> 16) as u8;
4170 let m3 = (keep_mask >> 24) as u8;
4171
4172 if m0 == 0xFF {
4173 unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
4174 } else if m0 != 0 {
4175 unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
4176 }
4177 let c0 = m0.count_ones() as usize;
4178
4179 if m1 == 0xFF {
4180 unsafe {
4181 std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4182 };
4183 } else if m1 != 0 {
4184 unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
4185 }
4186 let c1 = m1.count_ones() as usize;
4187
4188 if m2 == 0xFF {
4189 unsafe {
4190 std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4191 };
4192 } else if m2 != 0 {
4193 unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4194 }
4195 let c2 = m2.count_ones() as usize;
4196
4197 if m3 == 0xFF {
4198 unsafe {
4199 std::ptr::copy_nonoverlapping(
4200 sp.add(ri + 24),
4201 dp.add(wp + c0 + c1 + c2),
4202 8,
4203 )
4204 };
4205 } else if m3 != 0 {
4206 unsafe {
4207 compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
4208 };
4209 }
4210 let c3 = m3.count_ones() as usize;
4211 wp += c0 + c1 + c2 + c3;
4212 }
4213 ri += 32;
4214 }
4215 }
4216
4217 while ri < len {
4219 if wp + 1 > COMPACT_BUF {
4220 writer.write_all(&outbuf[..wp])?;
4221 wp = 0;
4222 }
4223 let b = unsafe { *sp.add(ri) };
4224 unsafe { *dp.add(wp) = b };
4225 wp += (b < lo || b > hi) as usize;
4226 ri += 1;
4227 }
4228
4229 if wp > 0 {
4230 writer.write_all(&outbuf[..wp])?;
4231 }
4232 return Ok(());
4233 }
4234
4235 #[cfg(not(target_arch = "x86_64"))]
4236 {
4237 for chunk in data.chunks(COMPACT_BUF) {
4239 let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4240 if clen > 0 {
4241 writer.write_all(&outbuf[..clen])?;
4242 }
4243 }
4244 return Ok(());
4245 }
4246
4247 #[allow(unreachable_code)]
4248 Ok(())
4249}
4250
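/// Zero-copy range delete: locates bytes in `lo..=hi` and emits the runs of
/// kept bytes between them as vectored writes referencing the original
/// mapping, so nothing is copied when deletions are rare.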
4251fn delete_range_mmap_zerocopy(
4256 data: &[u8],
4257 writer: &mut impl Write,
4258 lo: u8,
4259 hi: u8,
4260) -> io::Result<()> {
4261 #[cfg(target_arch = "x86_64")]
4262 {
4263 if get_simd_level() >= 3 {
4264 return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4265 }
4266 if get_simd_level() >= 2 {
4267 return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4268 }
4269 }
4270
4271 #[cfg(target_arch = "aarch64")]
4272 {
4273 return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4274 }
4275
4276 #[allow(unreachable_code)]
4278 delete_range_zerocopy_scalar(data, writer, lo, hi)
4279}
4280
4281fn delete_range_zerocopy_scalar(
4284 data: &[u8],
4285 writer: &mut impl Write,
4286 lo: u8,
4287 hi: u8,
4288) -> io::Result<()> {
4289 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4290 let len = data.len();
4291 let mut run_start: usize = 0;
4292 let mut i: usize = 0;
4293
4294 while i < len {
4295 let b = unsafe { *data.get_unchecked(i) };
4296 if b >= lo && b <= hi {
4297 if i > run_start {
4298 iov.push(std::io::IoSlice::new(&data[run_start..i]));
4299 if iov.len() >= MAX_IOV {
4300 write_ioslices(writer, &iov)?;
4301 iov.clear();
4302 }
4303 }
4304 run_start = i + 1;
4305 }
4306 i += 1;
4307 }
4308 if run_start < len {
4309 iov.push(std::io::IoSlice::new(&data[run_start..]));
4310 }
4311 if !iov.is_empty() {
4312 write_ioslices(writer, &iov)?;
4313 }
4314 Ok(())
4315}
4316
4317#[cfg(target_arch = "x86_64")]
4321#[target_feature(enable = "avx2")]
4322unsafe fn delete_range_zerocopy_avx2(
4323 data: &[u8],
4324 writer: &mut impl Write,
4325 lo: u8,
4326 hi: u8,
4327) -> io::Result<()> {
4328 use std::arch::x86_64::*;
4329
4330 unsafe {
4331 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4332 let len = data.len();
4333 let mut run_start: usize = 0;
4334 let mut ri: usize = 0;
4335
4336 let range = hi - lo;
4337 let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4338 let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4339 let zero = _mm256_setzero_si256();
4340
4341 while ri + 32 <= len {
4342 let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4343 let biased = _mm256_add_epi8(input, bias_v);
4344 let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4345 let in_range = _mm256_cmpeq_epi8(gt, zero);
4346 let del_mask = _mm256_movemask_epi8(in_range) as u32;
4347
4348 if del_mask == 0 {
4349 ri += 32;
4351 continue;
4352 }
4353
4354 let mut m = del_mask;
4356 while m != 0 {
4357 let bit = m.trailing_zeros() as usize;
4358 let abs_pos = ri + bit;
4359 if abs_pos > run_start {
4360 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4361 if iov.len() >= MAX_IOV {
4362 write_ioslices(writer, &iov)?;
4363 iov.clear();
4364 }
4365 }
4366 run_start = abs_pos + 1;
4368                m &= m - 1;
            }
4369
4370 ri += 32;
4371 }
4372
4373 while ri < len {
4375 let b = *data.get_unchecked(ri);
4376 if b >= lo && b <= hi {
4377 if ri > run_start {
4378 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4379 if iov.len() >= MAX_IOV {
4380 write_ioslices(writer, &iov)?;
4381 iov.clear();
4382 }
4383 }
4384 run_start = ri + 1;
4385 }
4386 ri += 1;
4387 }
4388
4389 if run_start < len {
4390 iov.push(std::io::IoSlice::new(&data[run_start..]));
4391 }
4392 if !iov.is_empty() {
4393 write_ioslices(writer, &iov)?;
4394 }
4395 Ok(())
4396 }
4397}
4398
4399#[cfg(target_arch = "x86_64")]
4401#[target_feature(enable = "sse2")]
4402unsafe fn delete_range_zerocopy_sse2(
4403 data: &[u8],
4404 writer: &mut impl Write,
4405 lo: u8,
4406 hi: u8,
4407) -> io::Result<()> {
4408 use std::arch::x86_64::*;
4409
4410 unsafe {
4411 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4412 let len = data.len();
4413 let mut run_start: usize = 0;
4414 let mut ri: usize = 0;
4415
4416 let range = hi - lo;
4417 let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4418 let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4419 let zero = _mm_setzero_si128();
4420
4421 while ri + 16 <= len {
4422 let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4423 let biased = _mm_add_epi8(input, bias_v);
4424 let gt = _mm_cmpgt_epi8(biased, threshold_v);
4425 let in_range = _mm_cmpeq_epi8(gt, zero);
4426 let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4427
4428 if del_mask == 0 {
4429 ri += 16;
4430 continue;
4431 }
4432
4433 let mut m = del_mask;
4434 while m != 0 {
4435 let bit = m.trailing_zeros() as usize;
4436 let abs_pos = ri + bit;
4437 if abs_pos > run_start {
4438 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4439 if iov.len() >= MAX_IOV {
4440 write_ioslices(writer, &iov)?;
4441 iov.clear();
4442 }
4443 }
4444 run_start = abs_pos + 1;
4445 m &= m - 1;
4446 }
4447
4448 ri += 16;
4449 }
4450
4451 while ri < len {
4452 let b = *data.get_unchecked(ri);
4453 if b >= lo && b <= hi {
4454 if ri > run_start {
4455 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4456 if iov.len() >= MAX_IOV {
4457 write_ioslices(writer, &iov)?;
4458 iov.clear();
4459 }
4460 }
4461 run_start = ri + 1;
4462 }
4463 ri += 1;
4464 }
4465
4466 if run_start < len {
4467 iov.push(std::io::IoSlice::new(&data[run_start..]));
4468 }
4469 if !iov.is_empty() {
4470 write_ioslices(writer, &iov)?;
4471 }
4472 Ok(())
4473 }
4474}
4475
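/// NEON variant of the zero-copy range delete: builds a 16-bit delete mask per
/// 16-byte block by masking each in-range lane to its positional bit and
/// reducing with pairwise widening adds (a movemask emulation), then walks the
/// set bits to split the data into kept runs.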
4476#[cfg(target_arch = "aarch64")]
4480#[target_feature(enable = "neon")]
4481unsafe fn delete_range_zerocopy_neon(
4482 data: &[u8],
4483 writer: &mut impl Write,
4484 lo: u8,
4485 hi: u8,
4486) -> io::Result<()> {
4487 use std::arch::aarch64::*;
4488
4489 unsafe {
4490 let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4491 let len = data.len();
4492 let mut run_start: usize = 0;
4493 let mut ri: usize = 0;
4494
4495 let lo_v = vdupq_n_u8(lo);
4496 let hi_v = vdupq_n_u8(hi);
4497 let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4499 let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4500
4501 while ri + 16 <= len {
4502 let input = vld1q_u8(data.as_ptr().add(ri));
4503 let ge_lo = vcgeq_u8(input, lo_v);
4505 let le_hi = vcleq_u8(input, hi_v);
4506 let in_range = vandq_u8(ge_lo, le_hi);
4507
4508 let bits = vandq_u8(in_range, bit_mask_v);
4510            let pair = vpaddlq_u8(bits);
            let quad = vpaddlq_u16(pair);
            let octet = vpaddlq_u32(quad);
            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4514 let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4515 let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4516
4517 if del_mask == 0 {
4518 ri += 16;
4520 continue;
4521 }
4522
4523 let mut m = del_mask;
4525 while m != 0 {
4526 let bit = m.trailing_zeros() as usize;
4527 let abs_pos = ri + bit;
4528 if abs_pos > run_start {
4529 iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4530 if iov.len() >= MAX_IOV {
4531 write_ioslices(writer, &iov)?;
4532 iov.clear();
4533 }
4534 }
4535 run_start = abs_pos + 1;
4536 m &= m - 1;
4537 }
4538
4539 ri += 16;
4540 }
4541
4542 while ri < len {
4544 let b = *data.get_unchecked(ri);
4545 if b >= lo && b <= hi {
4546 if ri > run_start {
4547 iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4548 if iov.len() >= MAX_IOV {
4549 write_ioslices(writer, &iov)?;
4550 iov.clear();
4551 }
4552 }
4553 run_start = ri + 1;
4554 }
4555 ri += 1;
4556 }
4557
4558 if run_start < len {
4559 iov.push(std::io::IoSlice::new(&data[run_start..]));
4560 }
4561 if !iov.is_empty() {
4562 write_ioslices(writer, &iov)?;
4563 }
4564 Ok(())
4565 }
4566}

/// Compacts `chunk` into `outbuf`, dropping bytes that are in `member`.
/// Branchless inner loop: each byte is written unconditionally and the output
/// cursor only advances when the byte is kept. Returns the number of bytes
/// written; `outbuf` must be at least `chunk.len()` long.
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}
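
// A minimal check sketch for the branchless compaction loop above: the result
// should match a straightforward filter over the same membership set.
#[cfg(test)]
mod delete_chunk_bitset_sketch {
    use super::*;

    #[test]
    fn matches_plain_filter() {
        let chunk = b"hello, world! 123 hello";
        let member = build_member_set(b"lo1");
        let mut outbuf = vec![0u8; chunk.len()];
        let written = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        let expected: Vec<u8> = chunk
            .iter()
            .copied()
            .filter(|&b| !is_member(&member, b))
            .collect();
        assert_eq!(&outbuf[..written], &expected[..]);
    }
}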

/// Zero-copy delete for an arbitrary byte set: runs of kept bytes are pushed
/// as `IoSlice`s straight out of `data` and flushed in batches of `MAX_IOV`.
fn delete_bitset_zerocopy(
    data: &[u8],
    member: &[u8; 32],
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let len = data.len();
    let mut i = 0;
    let mut run_start: Option<usize> = None;

    while i < len {
        let b = unsafe { *data.get_unchecked(i) };
        if is_member(member, b) {
            if let Some(rs) = run_start {
                iov.push(std::io::IoSlice::new(&data[rs..i]));
                run_start = None;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
        } else if run_start.is_none() {
            run_start = Some(i);
        }
        i += 1;
    }
    if let Some(rs) = run_start {
        iov.push(std::io::IoSlice::new(&data[rs..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
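
// A minimal check sketch for the zero-copy set delete: only member bytes are
// dropped and the surviving runs come out in order, with `Vec<u8>` as the writer.
#[cfg(test)]
mod delete_bitset_zerocopy_sketch {
    use super::*;

    #[test]
    fn drops_only_member_bytes() {
        let data = b"abc-def-ghi";
        let member = build_member_set(b"-e");
        let mut out = Vec::new();
        delete_bitset_zerocopy(data, &member, &mut out).unwrap();
        assert_eq!(out, b"abcdfghi".to_vec());
    }
}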

/// Deletes a single byte value using `memchr` to locate occurrences, writing
/// the gaps between matches as `IoSlice`s.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
            if iov.len() >= MAX_IOV {
                write_ioslices(writer, &iov)?;
                iov.clear();
            }
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
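
// A minimal usage sketch: deleting one byte value from an in-memory buffer,
// with `Vec<u8>` standing in for the real output stream.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let mut out = Vec::new();
        delete_single_char_mmap(b',', b"a,b,,c,", &mut out).unwrap();
        assert_eq!(out, b"abc".to_vec());
    }
}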

/// Deletes a set of two or three byte values, dispatching to the `memchr2` /
/// `memchr3` iterators and emitting the kept runs as `IoSlice`s.
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut last = 0;

    macro_rules! process_pos {
        ($pos:expr) => {
            if $pos > last {
                iov.push(std::io::IoSlice::new(&data[last..$pos]));
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            last = $pos + 1;
        };
    }

    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            process_pos!(pos);
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            process_pos!(pos);
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
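
// A minimal check sketch covering both the memchr2 and memchr3 paths of the
// multi-byte delete above.
#[cfg(test)]
mod delete_multi_memchr_sketch {
    use super::*;

    #[test]
    fn removes_two_and_three_byte_sets() {
        let data = b"a-b_c-d_e";

        let mut out2 = Vec::new();
        delete_multi_memchr_mmap(b"-_", data, &mut out2).unwrap();
        assert_eq!(out2, b"abcde".to_vec());

        let mut out3 = Vec::new();
        delete_multi_memchr_mmap(b"-_e", data, &mut out3).unwrap();
        assert_eq!(out3, b"abcd".to_vec());
    }
}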

/// Deletes every byte in `delete_chars`, then collapses runs of bytes from
/// `squeeze_chars` in what remains, in a single pass over `data`.
/// `last_squeezed` is a `u16` so that 256 can serve as the "no previous
/// squeezed byte" sentinel.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    let mut outbuf = alloc_uninit_vec(data.len());
    let mut last_squeezed: u16 = 256;
    let mut out_pos = 0;

    for &b in data.iter() {
        if is_member(&delete_set, b) {
            continue;
        }
        if is_member(&squeeze_set, b) {
            if last_squeezed == b as u16 {
                continue;
            }
            last_squeezed = b as u16;
        } else {
            last_squeezed = 256;
        }
        unsafe {
            *outbuf.get_unchecked_mut(out_pos) = b;
        }
        out_pos += 1;
    }
    writer.write_all(&outbuf[..out_pos])
}
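
// A minimal check sketch for the combined delete + squeeze pass: digits are
// deleted first, then the runs of spaces left behind are collapsed.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn deletes_then_squeezes_survivors() {
        let data = b"a1  2  b33   c";
        let mut out = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", data, &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }
}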

/// Collapses each run of a byte from `squeeze_chars` down to a single
/// occurrence. Sets of up to three bytes get dedicated memchr-based paths;
/// large inputs are squeezed per chunk in parallel and the chunk outputs are
/// stitched so that a run spanning a chunk boundary still collapses to one byte.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Stitch chunk outputs: if the previous non-empty chunk ended with a
        // squeezable byte, drop the leading run of that same byte here.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut outbuf = alloc_uninit_vec(data.len());
    let len = data.len();
    let mut wp = 0;
    let mut i = 0;
    let mut last_squeezed: u16 = 256;

    unsafe {
        let inp = data.as_ptr();
        let outp = outbuf.as_mut_ptr();

        while i < len {
            let b = *inp.add(i);
            if is_member(&member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
    }
    writer.write_all(&outbuf[..wp])
}
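
// A minimal check sketch for the generic bitset squeeze path (four squeeze
// bytes, so the memchr fast paths are skipped and the scalar loop runs).
#[cfg(test)]
mod squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_any_set_member() {
        let data = b"aaab..ccc!!dd..a";
        let mut out = Vec::new();
        squeeze_mmap(b"a.c!", data, &mut out).unwrap();
        assert_eq!(out, b"ab.c!dd.a".to_vec());
    }
}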

/// Squeezes one chunk into a freshly allocated buffer. A run that straddles a
/// chunk boundary is only collapsed within each chunk; the callers
/// (`squeeze_mmap` / `squeeze_multi_mmap`) fix up the boundary when they
/// stitch the chunk outputs together.
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

/// Squeezes a two- or three-byte set, using `memchr2`/`memchr3` to jump between
/// occurrences; untouched stretches are emitted as `IoSlice`s straight from
/// `data`. Large inputs take the same parallel chunk-and-stitch path as
/// `squeeze_mmap`.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
    let mut cursor = 0;
    let mut last_squeezed: u16 = 256;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }
    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
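
// A minimal check sketch for the two-byte memchr2 squeeze path; the literal
// stretch between matches must reset the run tracking.
#[cfg(test)]
mod squeeze_multi_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_two_byte_set() {
        let mut out = Vec::new();
        squeeze_multi_mmap::<2>(b"a.", b"aa..b.aa", &mut out).unwrap();
        assert_eq!(out, b"a.b.a".to_vec());
    }
}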

/// Squeezes runs of a single byte. Only an adjacent pair `ch ch` marks the
/// start of a run that needs collapsing, so `memmem` is used to find pairs;
/// if none exists the input is forwarded untouched.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let pair = [ch, ch];
    if memchr::memmem::find(data, &pair).is_none() {
        return writer.write_all(data);
    }

    let finder = memchr::memmem::Finder::new(&pair);
    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
    let mut cursor = 0;

    while cursor < data.len() {
        match finder.find(&data[cursor..]) {
            Some(offset) => {
                let pair_pos = cursor + offset;
                // Keep everything up to and including the first byte of the
                // pair, then skip the rest of the run.
                let seg_end = pair_pos + 1;
                if seg_end > cursor {
                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
                }
                let mut skip = seg_end;
                while skip < data.len() && data[skip] == ch {
                    skip += 1;
                }
                cursor = skip;
                if iov.len() >= MAX_IOV {
                    write_ioslices(writer, &iov)?;
                    iov.clear();
                }
            }
            None => {
                if cursor < data.len() {
                    iov.push(std::io::IoSlice::new(&data[cursor..]));
                }
                break;
            }
        }
    }

    if !iov.is_empty() {
        write_ioslices(writer, &iov)?;
    }
    Ok(())
}
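
// A minimal check sketch for the single-byte squeeze: runs collapse to one
// byte, and input with no adjacent pair is forwarded untouched.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    #[test]
    fn collapses_runs_of_one_byte() {
        let mut out = Vec::new();
        squeeze_single_mmap(b'-', b"--a---b-c----", &mut out).unwrap();
        assert_eq!(out, b"-a-b-c-".to_vec());

        let mut untouched = Vec::new();
        squeeze_single_mmap(b'-', b"a-b-c", &mut untouched).unwrap();
        assert_eq!(untouched, b"a-b-c".to_vec());
    }
}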