use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum number of `IoSlice`s handed to one `write_vectored` call
/// (kept at the conventional `IOV_MAX` of 1024).
const MAX_IOV: usize = 1024;

/// Chunk size used when copying through an intermediate buffer.
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Buffer size for streaming reads from stdin/pipes.
const STREAM_BUF: usize = 16 * 1024 * 1024;

/// Inputs of at least this size are split across rayon worker threads.
const PARALLEL_THRESHOLD: usize = 16 * 1024 * 1024;
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: skip the slices that were fully written and
                // finish the remainder with write_all.
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
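
// Illustrative sketch, not part of the original file: `Vec<u8>`'s default
// `write_vectored` only consumes the first non-empty slice, so this test
// also exercises the partial-write recovery path above.
#[cfg(test)]
mod write_ioslices_tests {
    use super::*;

    #[test]
    fn concatenates_batches_and_partial_writes() {
        let parts: [&[u8]; 3] = [b"foo", b"", b"bar"];
        let slices: Vec<std::io::IoSlice> =
            parts.iter().map(|p| std::io::IoSlice::new(p)).collect();
        let mut out: Vec<u8> = Vec::new();
        write_ioslices(&mut out, &slices).unwrap();
        assert_eq!(out, b"foobar");
    }
}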

#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        // SAFETY: callers treat this as write-only scratch space and only
        // read back the prefix they have already written.
        v.set_len(len);
    }
    v
}

/// Builds the 256-entry byte translation table. Following `tr` semantics,
/// when `set2` is shorter than `set1` its last byte is repeated; when `set2`
/// is empty, bytes map to themselves.
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}
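
// Illustrative sketch, not part of the original file: checks the padding
// rule above (a short `set2` repeats its last byte).
#[cfg(test)]
mod build_translate_table_tests {
    use super::*;

    #[test]
    fn pads_with_last_byte_of_set2() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded
        assert_eq!(table[b'd' as usize], b'd'); // untouched
    }
}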

/// Builds a 256-bit membership bitmap: one bit per byte value.
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}
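
// Illustrative sketch, not part of the original file: the bitmap stores one
// bit per byte value, so membership is a shift, a mask, and a load.
#[cfg(test)]
mod member_set_tests {
    use super::*;

    #[test]
    fn bitmap_membership_matches_input() {
        let set = build_member_set(b"\0az\xff");
        for b in 0..=255u8 {
            let expected = b == 0 || b == b'a' || b == b'z' || b == 0xff;
            assert_eq!(is_member(&set, b), expected);
        }
    }
}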

/// Cached CPU capability: 0 = undetected, 1 = scalar only, 2 = SSSE3,
/// 3 = AVX2. Detection races are benign; every thread detects the same value.
#[cfg(target_arch = "x86_64")]
static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn get_simd_level() -> u8 {
    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
    if level != 0 {
        return level;
    }
    let detected = if is_x86_feature_detected!("avx2") {
        3
    } else if is_x86_feature_detected!("ssse3") {
        2
    } else {
        1
    };
    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
    detected
}

#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// Full 256-byte table lookup in SIMD: the table is treated as 16 rows of 16
// bytes. PSHUFB indexes every row by the low nibble of each input byte, and
// a compare against the high nibble selects which row's result survives.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // Broadcast each 16-byte table row into both 128-bit lanes.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // One 16-byte step over the tail using the SSE halves of the LUT.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

// SSE-only form of the nibble-LUT lookup above.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        let level = get_simd_level();
        if level >= 3 {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if level >= 2 {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

// Out-of-place variant of translate_inplace_avx2_table; see the notes there.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

/// Detects tables that shift one contiguous byte range by a constant offset
/// (e.g. `tr a-z A-Z` shifts [b'a', b'z'] by -32). Returns `(lo, hi, offset)`.
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}
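
// Illustrative sketch, not part of the original file: the classic
// lowercase-to-uppercase table is a single range shifted by a constant.
#[cfg(test)]
mod detect_range_offset_tests {
    use super::*;

    #[test]
    fn detects_lowercase_to_uppercase() {
        let lower: Vec<u8> = (b'a'..=b'z').collect();
        let upper: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&lower, &upper);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }
}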

/// Detects tables that map one contiguous byte range to a single replacement
/// byte (e.g. `tr 0-9 x`). Returns `(lo, hi, replacement)`.
#[inline]
fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut replacement = 0u8;

    for i in 0..256 {
        if table[i] != i as u8 {
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    replacement = table[i];
                }
                Some(_) => {
                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, replacement))
}
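
// Illustrative sketch, not part of the original file: mapping all digits to
// one byte is the range-to-constant shape (set2 pads with its last byte).
#[cfg(test)]
mod detect_range_to_constant_tests {
    use super::*;

    #[test]
    fn detects_digits_to_x() {
        let digits: Vec<u8> = (b'0'..=b'9').collect();
        let table = build_translate_table(&digits, b"x");
        assert_eq!(detect_range_to_constant(&table), Some((b'0', b'9', b'x')));
    }
}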

#[cfg(target_arch = "x86_64")]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
    } else {
        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
    }
}

// Range test via signed bias: adding (0x80 - lo) maps `lo` to i8::MIN, so a
// byte lies in [lo, hi] exactly when its biased value is not greater than
// i8::MIN + (hi - lo). That turns an unsigned two-sided range check into a
// single signed compare per vector.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_to_constant_avx2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm256_set1_epi8(replacement as i8);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let result = _mm256_blendv_epi8(input, repl_v, in_range);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let repl_v128 = _mm_set1_epi8(replacement as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let result = _mm_blendv_epi8(input, repl_v128, in_range);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_to_constant_sse2_inplace(
    data: &mut [u8],
    lo: u8,
    hi: u8,
    replacement: u8,
) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let repl_v = _mm_set1_epi8(replacement as i8);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let result = _mm_or_si128(
                _mm_and_si128(in_range, repl_v),
                _mm_andnot_si128(in_range, input),
            );
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = replacement;
        }
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

// Same signed-bias range test as above, but adds a constant offset to bytes
// inside the range instead of replacing them.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if get_simd_level() >= 3 {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}
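
// Illustrative sketch, not part of the original file: whichever code path is
// selected, the range translation must agree with the obvious scalar loop.
#[cfg(test)]
mod translate_range_inplace_tests {
    #[test]
    fn matches_scalar_reference() {
        let mut data: Vec<u8> = (0..=255u8).cycle().take(1000).collect();
        let expected: Vec<u8> = data
            .iter()
            .map(|&b| if (b'a'..=b'z').contains(&b) { b - 32 } else { b })
            .collect();
        super::translate_range_simd_inplace(&mut data, b'a', b'z', -32);
        assert_eq!(data, expected);
    }
}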

/// Detects whether the delete set is one contiguous byte range. This assumes
/// `chars` contains no duplicates; a duplicated byte could make the length
/// test match a range with holes.
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // A duplicate-free set spanning exactly hi - lo + 1 values is contiguous.
    if (hi - lo + 1) as usize == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}
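
// Illustrative sketch, not part of the original file: contiguous sets are
// detected regardless of input order; a gapped set falls back to the bitmap.
#[cfg(test)]
mod detect_delete_range_tests {
    use super::*;

    #[test]
    fn contiguous_and_gapped_sets() {
        assert_eq!(detect_delete_range(b"cab"), Some((b'a', b'c')));
        assert_eq!(detect_delete_range(b"az"), None);
    }
}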

#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if get_simd_level() >= 3 {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // Bit k set in keep_mask means byte k survives.
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                // Nothing deleted in this vector: bulk copy.
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                // Compact 8 bytes at a time under the keep mask.
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 16),
                    dp.add(wp + c0 + c1),
                    (keep_mask >> 16) as u8,
                );
                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 24),
                    dp.add(wp + c0 + c1 + c2),
                    (keep_mask >> 24) as u8,
                );
                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

/// Compacts up to 8 source bytes into `dst`, keeping byte `k` iff bit `k` of
/// `mask` is set. The caller advances `dst` by `mask.count_ones()`.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let mut m = mask;
        let mut w = 0;
        while m != 0 {
            let bit = m.trailing_zeros() as usize;
            *dst.add(w) = *src.add(bit);
            w += 1;
            m &= m - 1; // clear the lowest set bit
        }
    }
}
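
// Illustrative sketch, not part of the original file: keep-mask 0b0000_0101
// keeps bytes 0 and 2 of the 8-byte group.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod compact_8bytes_tests {
    #[test]
    fn keeps_only_masked_bytes() {
        let src = *b"abcdefgh";
        let mut dst = [0u8; 8];
        unsafe { super::compact_8bytes(src.as_ptr(), dst.as_mut_ptr(), 0b0000_0101) };
        assert_eq!(&dst[..2], b"ac");
    }
}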

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = alloc_uninit_vec(STREAM_BUF);
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

/// Streaming translate (`tr SET1 SET2`): reads from `reader`, translates, and
/// writes to `writer`, picking the fastest applicable kernel for the table.
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    // An identity table (e.g. set1 == set2) degenerates to a plain copy.
    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
    }

    // General case: full table lookup, in place, buffer by buffer.
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}
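
// Illustrative sketch, not part of the original file: end-to-end use of the
// streaming entry point over in-memory I/O.
#[cfg(test)]
mod translate_tests {
    #[test]
    fn maps_bytes_through_table() {
        let mut input: &[u8] = b"hello, world";
        let mut output: Vec<u8> = Vec::new();
        super::translate(b"lo", b"01", &mut input, &mut output).unwrap();
        assert_eq!(output, b"he001, w1r0d");
    }
}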

fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn translate_range_to_constant_stream(
    lo: u8,
    hi: u8,
    replacement: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

/// Reads until `buf` is full or EOF, retrying on `Interrupted` (the original
/// first-read fast path propagated `Interrupted`; this version retries it
/// like every subsequent read). Returns the byte count; 0 means EOF.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
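
// Illustrative sketch, not part of the original file: short reads are
// accumulated until the buffer is full or the source is exhausted.
#[cfg(test)]
mod read_full_tests {
    use std::io::Read;

    // A reader that returns at most 3 bytes per call.
    struct Dribble<'a>(&'a [u8]);

    impl Read for Dribble<'_> {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            let n = self.0.len().min(buf.len()).min(3);
            buf[..n].copy_from_slice(&self.0[..n]);
            self.0 = &self.0[n..];
            Ok(n)
        }
    }

    #[test]
    fn fills_buffer_across_short_reads() {
        let mut r = Dribble(b"abcdefgh");
        let mut buf = [0u8; 8];
        assert_eq!(super::read_full(&mut r, &mut buf).unwrap(), 8);
        assert_eq!(&buf, b"abcdefgh");
    }
}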

/// `tr -s SET1 SET2`: translate, then squeeze repeats of bytes in `set2`.
/// `last_squeezed` is a u16 so the sentinel 256 can mean "no previous byte";
/// it carries across buffers so runs that span reads still collapse.
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let range_info = detect_range_offset(&table);
    let range_const_info = if range_info.is_none() {
        detect_range_to_constant(&table)
    } else {
        None
    };

    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else if let Some((lo, hi, replacement)) = range_const_info {
            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// `tr -d SET1`, streaming: sets of up to three bytes go through memchr; a
/// contiguous range uses the SIMD range kernel; anything else falls back to
/// the membership bitmap.
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                // Branchless in-place compaction: always store, but only
                // advance the write cursor when the byte is kept (wp <= i,
                // so stores never clobber unread input).
                *ptr.add(wp) = b0;
                wp += !is_member(&member, b0) as usize;
                *ptr.add(wp) = b1;
                wp += !is_member(&member, b1) as usize;
                *ptr.add(wp) = b2;
                wp += !is_member(&member, b2) as usize;
                *ptr.add(wp) = b3;
                wp += !is_member(&member, b3) as usize;
                *ptr.add(wp) = b4;
                wp += !is_member(&member, b4) as usize;
                *ptr.add(wp) = b5;
                wp += !is_member(&member, b5) as usize;
                *ptr.add(wp) = b6;
                wp += !is_member(&member, b6) as usize;
                *ptr.add(wp) = b7;
                wp += !is_member(&member, b7) as usize;
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                *ptr.add(wp) = b;
                wp += !is_member(&member, b) as usize;
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = alloc_uninit_vec(STREAM_BUF);
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &src[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = alloc_uninit_vec(STREAM_BUF);
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &src[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

/// `tr -d SET1 -s SET2`, streaming: delete first, then squeeze the survivors.
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// `tr -s SET1`, streaming: collapse runs of bytes in the squeeze set.
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
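
// Illustrative sketch, not part of the original file: squeezing collapses
// runs only for bytes in the squeeze set; other runs are left intact.
#[cfg(test)]
mod squeeze_tests {
    #[test]
    fn collapses_runs_of_set_members() {
        let mut input: &[u8] = b"aaabbb  ccc";
        let mut output: Vec<u8> = Vec::new();
        super::squeeze(b"ab ", &mut input, &mut output).unwrap();
        assert_eq!(output, b"ab ccc");
    }
}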

/// Squeeze for a single byte: memmem finds `chch` pairs, and the run that
/// follows each pair is skipped. `was_squeeze_char` carries run state across
/// buffer boundaries.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = alloc_uninit_vec(STREAM_BUF);
    let mut was_squeeze_char = false;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let ptr = buf.as_mut_ptr();
        let mut wp = 0;
        let mut i = 0;

        // If the previous buffer ended inside a run, drop its continuation.
        if was_squeeze_char {
            while i < n && unsafe { *ptr.add(i) } == ch {
                i += 1;
            }
            if i >= n {
                continue;
            }
        }

        loop {
            match finder.find(&buf[i..n]) {
                Some(offset) => {
                    // Keep everything up to and including the first byte of
                    // the pair, then skip the rest of the run.
                    let copy_len = offset + 1;
                    if copy_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
                        }
                    }
                    wp += copy_len;
                    i += offset + 1;
                    while i < n && unsafe { *ptr.add(i) } == ch {
                        i += 1;
                    }
                    if i >= n {
                        was_squeeze_char = true;
                        break;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
                        }
                    }
                    wp += run_len;
                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// Translates a fully owned buffer in place (used when the whole input is
/// already in memory), splitting across rayon workers for large inputs.
pub fn translate_owned(
    set1: &[u8],
    set2: &[u8],
    data: &mut [u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_simd_inplace(chunk, lo, hi, offset);
            });
        } else {
            translate_range_simd_inplace(data, lo, hi, offset);
        }
        return writer.write_all(data);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
            });
        } else {
            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
        }
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);
        data.par_chunks_mut(chunk_size).for_each(|chunk| {
            translate_inplace(chunk, &table);
        });
    } else {
        translate_inplace(data, &table);
    }
    writer.write_all(data)
}
1871
1872const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1880
/// Translate over a memory-mapped (read-only) input: the same kernel
/// specializations as the streaming path, writing through scratch buffers.
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
    }

    translate_mmap_table(data, writer, &table)
}

fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_range_to_constant(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    replacement: u8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
                translate_range_to_constant_simd_inplace(
                    &mut dst_chunk[..src_chunk.len()],
                    lo,
                    hi,
                    replacement,
                );
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = data.to_vec();
        translate_range_to_constant_simd_inplace(&mut buf, lo, hi, replacement);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        buf[..chunk.len()].copy_from_slice(chunk);
        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

/// `tr -s SET1 SET2` over a memory-mapped input.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Translation parallelizes cleanly; squeezing is inherently sequential,
    // so it runs as a second pass over the translated buffer.
    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

/// `tr -d SET1` over a memory-mapped input.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Each worker compacted its chunk into the matching region of
        // outbuf; slide the kept prefixes together sequentially.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

/// Bitmap delete of `chunk` into `outbuf` (branchless, 8x unrolled);
/// returns the number of bytes written.
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

/// Single-byte delete over mmap: gathers the kept spans between occurrences
/// and writes them zero-copy via vectored I/O.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::new();
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    write_ioslices(writer, &iov)
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::new();
    let mut last = 0;
    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            if pos > last {
                iov.push(std::io::IoSlice::new(&data[last..pos]));
            }
            last = pos + 1;
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            if pos > last {
                iov.push(std::io::IoSlice::new(&data[last..pos]));
            }
            last = pos + 1;
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    write_ioslices(writer, &iov)
}
2383
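/// Combined delete-then-squeeze (`tr -ds SET1 SET2`): bytes in
/// `delete_chars` are removed, and among the survivors every run of a byte
/// from `squeeze_chars` collapses to a single occurrence.
///
/// For example (a sketch; the writer is anything implementing `Write`):
///
/// ```ignore
/// // tr -ds '-' 'x' turns "x--xx-x" into "x"
/// delete_squeeze_mmap(b"-", b"x", b"x--xx-x", &mut writer)?;
/// ```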
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    // Small inputs: delete and squeeze in one pass, then write once.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let mut last_squeezed: u16 = 256; // 256 is no byte value: nothing squeezed yet
        unsafe {
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    // Larger inputs: stream through a fixed-size buffer. `last_squeezed`
    // survives across chunks, so a run split at a chunk boundary still
    // collapses correctly.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

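/// Squeeze (`tr -s SET`): collapse every run of a byte in `squeeze_chars`
/// down to a single occurrence.
///
/// One-, two-, and three-byte sets dispatch to `memchr`-driven fast paths;
/// larger sets use the bitset membership test. Inputs at or above
/// `PARALLEL_THRESHOLD` are squeezed per chunk on the rayon pool, with the
/// chunk seams stitched afterwards so a run that straddles a boundary still
/// collapses to one byte.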
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    // Large inputs: squeeze each chunk independently, then stitch the seams.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // A squeezable run can straddle a chunk boundary: if the previous
        // chunk's output ends with a squeezable byte and this chunk's output
        // starts with the same byte, drop the leading repeat before writing.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    // Small inputs: squeeze in one pass and write once.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    // Everything in between: stream through a fixed-size buffer;
    // `last_squeezed` carries across chunk boundaries.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

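/// Squeezes a single chunk into a freshly allocated buffer; this is the
/// per-chunk worker for the parallel paths. Runs that straddle chunk
/// boundaries are the caller's problem (see the seam stitching above).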
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = alloc_uninit_vec(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

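/// Squeeze for exactly two or three target bytes. `N` selects between
/// `memchr2` and `memchr3` at compile time; everything between matches is
/// copied through a staging buffer untouched.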
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        // The memchr specialisation only pays off in the sequential loop
        // below; the parallel path reuses the bitset worker.
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Stitch chunk seams exactly as in `squeeze_mmap`: drop a leading
        // repeat when the previous output ends with the same squeezable byte.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    // Locate the next occurrence of any target byte.
    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    // Append `$len` bytes to the staging buffer, flushing first if they do
    // not fit and bypassing the buffer entirely for oversized spans.
    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip the rest of the run.
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

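/// Squeeze for a single target byte. If `data` never contains `ch` twice in
/// a row there is nothing to squeeze, and the whole input is written through
/// untouched.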
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // No doubled occurrence of `ch` means nothing to squeeze.
    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        // Squeeze each chunk on the rayon pool, then stitch the seams below.
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        // A run of `ch` can straddle a chunk boundary: if the previous
        // chunk's output already ends with `ch`, drop this chunk's leading
        // `ch` before writing.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        // Squeezed output never repeats `ch`, so this skips
                        // at most one byte.
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
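
// A minimal sanity-check sketch for the paths above. These inputs are tiny,
// so the tests assume `SINGLE_WRITE_LIMIT` comfortably exceeds them (only
// the sequential branches are exercised) and that `delete_range_chunk`
// filters the inclusive byte range its name suggests.
#[cfg(test)]
mod squeeze_delete_sanity {
    use super::*;

    #[test]
    fn delete_range_drops_digits() {
        // like `tr -d '0-9'`
        let mut out = Vec::new();
        delete_range_mmap(b"a1b22c333", &mut out, b'0', b'9').unwrap();
        assert_eq!(out, b"abc".to_vec());
    }

    #[test]
    fn squeeze_single_collapses_runs() {
        // like `tr -s ' '`
        let mut out = Vec::new();
        squeeze_mmap(b" ", b"a  b   c", &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }

    #[test]
    fn delete_then_squeeze() {
        // like `tr -ds '-' 'x'`
        let mut out = Vec::new();
        delete_squeeze_mmap(b"-", b"x", b"x--xx-x", &mut out).unwrap();
        assert_eq!(out, b"x".to_vec());
    }
}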