1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
/// Upper bound on the number of `IoSlice`s handed to one `write_vectored` call
/// (kernels cap vectored writes at IOV_MAX; 1024 is the common limit).
const MAX_IOV: usize = 1024;

/// General-purpose buffer size, 4 MiB.
/// NOTE(review): not referenced in this chunk — presumably used elsewhere in the file.
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Chunk size (1 MiB) for the byte-filtering streaming paths (delete/squeeze).
const STREAM_BUF: usize = 1024 * 1024;

/// Chunk size (4 MiB) for the translate streaming paths.
const TRANSLATE_STREAM_BUF: usize = 4 * 1024 * 1024;

/// Input size (2 MiB) above which parallel processing would kick in.
/// NOTE(review): not referenced in this chunk (rayon is imported) — verify use elsewhere.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
24
25#[inline]
28fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
29 if slices.is_empty() {
30 return Ok(());
31 }
32 for batch in slices.chunks(MAX_IOV) {
33 let total: usize = batch.iter().map(|s| s.len()).sum();
34 match writer.write_vectored(batch) {
35 Ok(n) if n >= total => continue,
36 Ok(mut written) => {
37 for slice in batch {
39 let slen = slice.len();
40 if written >= slen {
41 written -= slen;
42 continue;
43 }
44 if written > 0 {
45 writer.write_all(&slice[written..])?;
46 written = 0;
47 } else {
48 writer.write_all(slice)?;
49 }
50 }
51 }
52 Err(e) => return Err(e),
53 }
54 }
55 Ok(())
56}
57
#[inline]
// Allocates a `Vec<u8>` of length `len` without zero-initializing its contents.
//
// Contract: callers must fully write the prefix they later read (every call
// site here fills `..wp` via a kernel before touching it).
// NOTE(review): setting the length over uninitialized bytes is UB by a strict
// reading of `Vec::set_len`'s docs even for `u8` — consider `MaybeUninit` or
// plain zeroing; verify with miri.
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        // SAFETY: capacity >= len was just reserved; contents deliberately
        // left uninitialized (see contract above).
        v.set_len(len);
    }
    v
}
70
#[inline]
/// Builds a 256-entry byte translation table mapping `set1[i] -> set2[i]`.
/// When `set1` is longer than `set2`, excess sources map to the last byte of
/// `set2` (tr's pad-with-last rule); with an empty `set2` they map to themselves.
/// Unmapped bytes stay identity.
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let pad = set2.last().copied();
    for (i, &src) in set1.iter().enumerate() {
        let dst = match set2.get(i) {
            Some(&d) => d,
            None => pad.unwrap_or(src),
        };
        table[src as usize] = dst;
    }
    table
}
85
#[inline]
/// Builds a 256-bit membership bitmap (32 bytes) with one bit set per byte
/// value present in `chars`.
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    chars.iter().fold([0u8; 32], |mut bits, &c| {
        bits[(c >> 3) as usize] |= 1 << (c & 7);
        bits
    })
}
95
#[inline(always)]
/// Tests whether `ch`'s bit is set in a 256-bit membership bitmap.
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    // `ch >> 3` is at most 31, so the index is always in bounds and the
    // compiler can elide the check.
    set[(ch >> 3) as usize] & (1 << (ch & 7)) != 0
}
100
#[inline(always)]
/// Rewrites every byte of `data` through the 256-entry lookup `table`,
/// dispatching to the fastest kernel supported by the running CPU.
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was just verified at runtime.
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            // SAFETY: SSSE3 support was just verified at runtime.
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}
119
#[inline(always)]
/// Scalar fallback: maps each byte of `data` through `table` in place.
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    // The table is indexed by the byte's own value, which is always in
    // 0..=255, so safe indexing compiles without a reachable bounds check.
    for byte in data.iter_mut() {
        *byte = table[*byte as usize];
    }
}
144
#[cfg(target_arch = "x86_64")]
/// In-place table translation using AVX2.
///
/// The 256-byte table is viewed as 16 rows of 16 entries. For each input byte,
/// the high nibble selects a row and `vpshufb` on the low nibble selects the
/// entry within it; the 16 candidate lookups are merged with cmpeq/and/or masks.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One broadcast LUT row per high-nibble value.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        // Main loop: 32 bytes per iteration.
        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            // For high-nibble value $h: select lanes whose high nibble matches,
            // look their low nibble up in row $h, and OR into the result.
            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // One 16-byte SSE tail block, if it fits.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            // Each 256-bit row holds the same 128-bit row twice; take the low half.
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
278
#[cfg(target_arch = "x86_64")]
/// In-place table translation using SSSE3 (16 bytes per iteration).
///
/// Same scheme as the AVX2 kernel: the table is 16 rows of 16 entries, the
/// high nibble selects a row, and `pshufb` on the low nibble selects the
/// entry; per-row results are merged with cmpeq/and/or masks.
///
/// # Safety
/// The caller must ensure the CPU supports SSSE3.
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        // One 128-bit LUT row per high-nibble value.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}
342
#[inline(always)]
/// Translates `src` through `table` into `dst` (which must be at least as
/// long as `src`), dispatching to the fastest kernel supported by the CPU.
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: AVX2 support was just verified at runtime.
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            // SAFETY: SSSE3 support was just verified at runtime.
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}
361
#[inline(always)]
/// Scalar fallback: maps each byte of `src` through `table` into `dst`.
/// Writes exactly `src.len()` bytes; the caller guarantees `dst` is long enough.
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    // zip stops at the shorter side, i.e. after src.len() output bytes.
    for (out, &input) in dst.iter_mut().zip(src.iter()) {
        *out = table[input as usize];
    }
}
387
#[cfg(target_arch = "x86_64")]
/// Out-of-place table translation using AVX2 (src -> dst).
///
/// Same nibble-shuffle scheme as the in-place AVX2 kernel: high nibble selects
/// one of 16 broadcast LUT rows, `vpshufb` on the low nibble selects the entry,
/// and per-row results are merged with cmpeq/and/or masks.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 and that
/// `dst.len() >= src.len()` (exactly `src.len()` bytes are written unchecked).
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One broadcast LUT row per high-nibble value.
        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        // Main loop: 32 bytes per iteration.
        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        // One 16-byte SSE tail block, if it fits.
        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            // Low half of each broadcast row is the 128-bit LUT row.
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
495
#[cfg(target_arch = "x86_64")]
/// Out-of-place table translation using SSSE3 (16 bytes per iteration).
///
/// Same nibble-shuffle scheme as the other SIMD kernels: the high nibble
/// selects one of 16 LUT rows, `pshufb` on the low nibble selects the entry.
///
/// # Safety
/// The caller must ensure the CPU supports SSSE3 and that
/// `dst.len() >= src.len()` (exactly `src.len()` bytes are written unchecked).
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        // One 128-bit LUT row per high-nibble value.
        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail for the final < 16 bytes.
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}
559
#[inline]
/// Detects whether `table` differs from identity on exactly one contiguous
/// byte range shifted by a uniform offset (e.g. lowercase -> uppercase).
/// Returns `Some((lo, hi, offset))` when so, `None` for the identity table or
/// any other shape. The i8 truncation at the end is safe for callers because
/// all range kernels apply the offset with wrapping (mod-256) addition.
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    // (lo, hi, offset) of the non-identity span seen so far.
    let mut span: Option<(u8, u8, i16)> = None;

    for i in 0..256usize {
        let mapped = table[i];
        if mapped == i as u8 {
            continue;
        }
        let diff = mapped as i16 - i as i16;
        span = match span {
            // First non-identity entry starts the candidate span.
            None => Some((i as u8, i as u8, diff)),
            Some((lo, hi, off)) => {
                // Every further entry must extend the span contiguously with
                // the same offset.
                if diff != off || i as u8 != hi.wrapping_add(1) {
                    return None;
                }
                Some((lo, i as u8, off))
            }
        };
    }

    span.map(|(lo, hi, off)| (lo, hi, off as i8))
}
594
#[cfg(target_arch = "x86_64")]
/// Copies `src` to `dst`, adding `offset` (wrapping) to bytes in `[lo, hi]`.
/// Dispatches between the AVX2 and SSE2 kernels at runtime.
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified at runtime.
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        // SAFETY: SSE2 is part of the x86_64 baseline.
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}
606
#[cfg(target_arch = "x86_64")]
/// AVX2 kernel: copy `src` to `dst`, adding `offset` (wrapping) to every byte
/// in `[lo, hi]`.
///
/// Range-test trick: biasing each byte by `0x80 - lo` maps `[lo, hi]` onto
/// `[0x80, 0x80 + range]`, the bottom of the signed i8 order, so one signed
/// compare against `0x80 + range` classifies membership per lane.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 and that
/// `dst.len() >= src.len()` (exactly `src.len()` bytes are written unchecked).
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt lanes are *outside* the range; mask selects in-range lanes.
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            // Add the offset only where the mask is set.
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        // One 16-byte SSE tail block, if it fits.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
668
#[cfg(target_arch = "x86_64")]
/// SSE2 kernel: copy `src` to `dst`, adding `offset` (wrapping) to every byte
/// in `[lo, hi]`. Biasing by `0x80 - lo` lets one signed compare against
/// `0x80 + (hi - lo)` classify range membership per lane.
///
/// # Safety
/// SSE2 is baseline on x86_64; the caller must ensure
/// `dst.len() >= src.len()` (exactly `src.len()` bytes are written unchecked).
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            // gt lanes are outside the range; mask selects in-range lanes.
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
706
#[cfg(not(target_arch = "x86_64"))]
/// Portable fallback: copy `src` to `dst`, adding `offset` (wrapping) to every
/// byte within `[lo, hi]`.
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &byte) in src.iter().enumerate() {
        dst[i] = if (lo..=hi).contains(&byte) {
            byte.wrapping_add(offset as u8)
        } else {
            byte
        };
    }
}
718
#[cfg(target_arch = "x86_64")]
/// In place: adds `offset` (wrapping) to every byte of `data` in `[lo, hi]`.
/// Dispatches between the AVX2 and SSE2 kernels at runtime.
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified at runtime.
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        // SAFETY: SSE2 is part of the x86_64 baseline.
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}
734
#[cfg(target_arch = "x86_64")]
/// AVX2 kernel, in place: add `offset` (wrapping) to every byte of `data` in
/// `[lo, hi]`. Biasing by `0x80 - lo` maps the range to the bottom of the
/// signed i8 order so one signed compare classifies membership per lane.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            // gt lanes are outside the range; mask selects in-range lanes.
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        // One 16-byte SSE tail block, if it fits.
        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
789
#[cfg(target_arch = "x86_64")]
/// SSE2 kernel, in place: add `offset` (wrapping) to every byte of `data` in
/// `[lo, hi]`, using the same signed-bias range test as the AVX2 kernel.
///
/// # Safety
/// SSE2 is part of the x86_64 baseline, so this is always safe to call there.
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            // gt lanes are outside the range; mask selects in-range lanes.
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        // Scalar tail.
        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}
828
#[cfg(not(target_arch = "x86_64"))]
/// Portable fallback: adds `offset` (wrapping) to every byte of `data` that
/// falls within `[lo, hi]`.
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for byte in data.iter_mut() {
        if (lo..=hi).contains(byte) {
            *byte = byte.wrapping_add(offset as u8);
        }
    }
}
837
#[inline]
/// Returns `Some((lo, hi))` when the bytes in `chars` cover *exactly* the
/// contiguous range `lo..=hi` (duplicates allowed), enabling the SIMD
/// range-delete path. Returns `None` for empty input or a set with gaps.
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    // Count *distinct* characters. Comparing the raw slice length against the
    // range width (as the previous version did) wrongly classified inputs
    // with duplicates — e.g. [a, a, c] spans a..=c with len 3 but does not
    // contain 'b', so range-deleting would drop bytes it must keep.
    let mut seen = [false; 256];
    let mut distinct = 0usize;
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in chars {
        if !seen[c as usize] {
            seen[c as usize] = true;
            distinct += 1;
        }
        lo = lo.min(c);
        hi = hi.max(c);
    }
    // usize arithmetic avoids u8 overflow when the set spans all 256 values.
    if hi as usize - lo as usize + 1 == distinct {
        Some((lo, hi))
    } else {
        None
    }
}
869
#[cfg(target_arch = "x86_64")]
/// Copies the bytes of `src` that fall *outside* `[lo, hi]` into `dst`,
/// returning how many were kept. Dispatches AVX2/SSE2 at runtime.
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified at runtime.
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        // SAFETY: SSE2 is part of the x86_64 baseline.
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}
881
#[cfg(target_arch = "x86_64")]
/// AVX2 kernel: copy bytes of `src` outside `[lo, hi]` into `dst`; returns
/// the number of bytes kept. Range membership uses the signed-bias compare
/// trick; `movemask` turns in-range lanes into a bitmask, and the kept bytes
/// are compacted one set bit at a time.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 and `dst.len() >= src.len()`
/// (in the worst case every byte is kept and written unchecked).
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            // Bit i set <=> byte i is *outside* the deleted range (kept).
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            // Compact: copy each kept byte, clearing the lowest set bit.
            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 32;
        }

        // One 16-byte SSE tail block, if it fits.
        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        // Scalar tail.
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}
955
#[cfg(target_arch = "x86_64")]
/// SSE2 kernel: copy bytes of `src` outside `[lo, hi]` into `dst`; returns
/// the number of bytes kept. Same signed-bias range test and movemask-driven
/// compaction as the AVX2 kernel, 16 bytes at a time.
///
/// # Safety
/// SSE2 is baseline on x86_64; the caller must ensure `dst.len() >= src.len()`
/// (in the worst case every byte is kept and written unchecked).
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            // Bit i set <=> byte i is outside the deleted range (kept).
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        // Scalar tail.
        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}
1002
#[cfg(not(target_arch = "x86_64"))]
/// Portable fallback: copies bytes of `src` outside `[lo, hi]` into `dst`,
/// returning how many bytes were kept.
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut kept = 0;
    for &byte in src.iter() {
        if !(lo..=hi).contains(&byte) {
            dst[kept] = byte;
            kept += 1;
        }
    }
    kept
}
1015
1016fn delete_range_streaming(
1018 lo: u8,
1019 hi: u8,
1020 reader: &mut impl Read,
1021 writer: &mut impl Write,
1022) -> io::Result<()> {
1023 let mut src = vec![0u8; STREAM_BUF];
1024 let mut dst = alloc_uninit_vec(STREAM_BUF);
1025 loop {
1026 let n = read_full(reader, &mut src)?;
1027 if n == 0 {
1028 break;
1029 }
1030 let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
1031 if wp > 0 {
1032 writer.write_all(&dst[..wp])?;
1033 }
1034 }
1035 Ok(())
1036}
1037
1038pub fn translate(
1043 set1: &[u8],
1044 set2: &[u8],
1045 reader: &mut impl Read,
1046 writer: &mut impl Write,
1047) -> io::Result<()> {
1048 let table = build_translate_table(set1, set2);
1049
1050 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1052 if is_identity {
1053 return passthrough_stream(reader, writer);
1054 }
1055
1056 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1058 return translate_range_stream(lo, hi, offset, reader, writer);
1059 }
1060
1061 let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1069 loop {
1070 let n = read_full(reader, &mut buf)?;
1071 if n == 0 {
1072 break;
1073 }
1074 translate_inplace(&mut buf[..n], &table);
1075 writer.write_all(&buf[..n])?;
1076 }
1077 Ok(())
1078}
1079
1080fn translate_range_stream(
1083 lo: u8,
1084 hi: u8,
1085 offset: i8,
1086 reader: &mut impl Read,
1087 writer: &mut impl Write,
1088) -> io::Result<()> {
1089 let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1090 loop {
1091 let n = read_full(reader, &mut buf)?;
1092 if n == 0 {
1093 break;
1094 }
1095 translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
1096 writer.write_all(&buf[..n])?;
1097 }
1098 Ok(())
1099}
1100
1101fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1104 let mut buf = vec![0u8; TRANSLATE_STREAM_BUF];
1105 loop {
1106 let n = read_full(reader, &mut buf)?;
1107 if n == 0 {
1108 break;
1109 }
1110 writer.write_all(&buf[..n])?;
1111 }
1112 Ok(())
1113}
1114
#[inline]
/// Reads from `reader` until `buf` is full or EOF, retrying on `Interrupted`.
/// Returns the number of bytes read (0 only at immediate EOF).
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    while filled < buf.len() {
        let n = match reader.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        filled += n;
    }
    Ok(filled)
}
1129
/// Streams `reader` to `writer`, translating bytes via `set1` -> `set2` and
/// then squeezing runs of bytes that are members of `set2` down to one
/// occurrence (tr's translate + -s). Squeeze state carries across chunks so
/// runs spanning a chunk boundary are still collapsed.
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // If the table is a contiguous range + constant offset, use the cheaper
    // arithmetic kernel for the translation step.
    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    // Last emitted squeezable byte; 256 is a sentinel meaning "none".
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        // Compact in place, dropping repeats of squeezable bytes.
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue; // repeat of the previous squeezable byte
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                // SAFETY: wp <= i < n, so both pointers stay inside the buffer.
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1182
1183pub fn delete(
1184 delete_chars: &[u8],
1185 reader: &mut impl Read,
1186 writer: &mut impl Write,
1187) -> io::Result<()> {
1188 if delete_chars.len() == 1 {
1189 return delete_single_streaming(delete_chars[0], reader, writer);
1190 }
1191 if delete_chars.len() <= 3 {
1192 return delete_multi_streaming(delete_chars, reader, writer);
1193 }
1194
1195 if let Some((lo, hi)) = detect_delete_range(delete_chars) {
1199 return delete_range_streaming(lo, hi, reader, writer);
1200 }
1201
1202 let member = build_member_set(delete_chars);
1203 let mut buf = vec![0u8; STREAM_BUF];
1204
1205 loop {
1206 let n = read_full(reader, &mut buf)?;
1207 if n == 0 {
1208 break;
1209 }
1210 let mut wp = 0;
1211 unsafe {
1212 let ptr = buf.as_mut_ptr();
1213 let mut i = 0;
1214 while i + 8 <= n {
1215 let b0 = *ptr.add(i);
1216 let b1 = *ptr.add(i + 1);
1217 let b2 = *ptr.add(i + 2);
1218 let b3 = *ptr.add(i + 3);
1219 let b4 = *ptr.add(i + 4);
1220 let b5 = *ptr.add(i + 5);
1221 let b6 = *ptr.add(i + 6);
1222 let b7 = *ptr.add(i + 7);
1223
1224 if !is_member(&member, b0) {
1225 *ptr.add(wp) = b0;
1226 wp += 1;
1227 }
1228 if !is_member(&member, b1) {
1229 *ptr.add(wp) = b1;
1230 wp += 1;
1231 }
1232 if !is_member(&member, b2) {
1233 *ptr.add(wp) = b2;
1234 wp += 1;
1235 }
1236 if !is_member(&member, b3) {
1237 *ptr.add(wp) = b3;
1238 wp += 1;
1239 }
1240 if !is_member(&member, b4) {
1241 *ptr.add(wp) = b4;
1242 wp += 1;
1243 }
1244 if !is_member(&member, b5) {
1245 *ptr.add(wp) = b5;
1246 wp += 1;
1247 }
1248 if !is_member(&member, b6) {
1249 *ptr.add(wp) = b6;
1250 wp += 1;
1251 }
1252 if !is_member(&member, b7) {
1253 *ptr.add(wp) = b7;
1254 wp += 1;
1255 }
1256 i += 8;
1257 }
1258 while i < n {
1259 let b = *ptr.add(i);
1260 if !is_member(&member, b) {
1261 *ptr.add(wp) = b;
1262 wp += 1;
1263 }
1264 i += 1;
1265 }
1266 }
1267 writer.write_all(&buf[..wp])?;
1268 }
1269 Ok(())
1270}
1271
/// Streams `reader` to `writer`, removing every occurrence of the single byte
/// `ch`. Uses memchr to find occurrences and shifts the byte runs between
/// them down over the gaps, in place.
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break, // EOF
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // i = read cursor, wp = write cursor; wp <= i at all times.
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    // Keep the run before the match, then skip the match itself.
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                // SAFETY: both ranges lie within buf[..n];
                                // ptr::copy permits the overlap (wp < i here).
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    // No more matches: keep the remaining tail and stop.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                // SAFETY: both ranges lie within buf[..n];
                                // ptr::copy permits the overlap (wp < i here).
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1326
/// Streams `reader` to `writer`, removing every occurrence of the 2 or 3
/// bytes in `chars` (caller guarantees the length). Uses memchr2/memchr3 to
/// find occurrences and shifts the kept runs down over the gaps, in place.
fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break, // EOF
            Ok(n) => n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // i = read cursor, wp = write cursor; wp <= i at all times.
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    // Keep the run before the match, then skip the match itself.
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                // SAFETY: both ranges lie within buf[..n];
                                // ptr::copy permits the overlap (wp < i here).
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    // No more matches: keep the remaining tail and stop.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                // SAFETY: both ranges lie within buf[..n];
                                // ptr::copy permits the overlap (wp < i here).
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1386
/// Streams `reader` to `writer`, deleting every byte in `delete_chars`
/// and collapsing runs of bytes in `squeeze_chars` to a single byte.
///
/// `last_squeezed` persists across buffer refills so a run straddling two
/// reads still collapses to one byte; 256 is the "no previous squeezable
/// byte" sentinel (deliberately out of u8 range).
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break; // EOF
        }
        // Compact in place: wp trails i, dropping deleted/repeated bytes.
        let mut wp = 0;
        // SAFETY: both indices stay < n <= buf.len(), and wp <= i so the
        // write never overtakes unread input.
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue; // deleted byte: drop; squeeze state unchanged
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue; // repeat of the squeezed byte: drop
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256; // non-squeezable byte breaks the run
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1427
/// Streams `reader` to `writer`, collapsing every run of a byte in
/// `squeeze_chars` down to a single occurrence.
///
/// A one-byte set takes a memchr-based fast path; otherwise a bitset
/// membership test is applied per byte. `last_squeezed` (sentinel 256)
/// carries the squeeze state across buffer refills.
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break; // EOF
        }
        // Compact in place: wp trails i, dropping repeated squeezable bytes.
        let mut wp = 0;
        // SAFETY: both indices stay < n <= buf.len(), and wp <= i.
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue; // repeat of the squeezed byte: drop
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256; // non-squeezable byte breaks the run
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1467
/// Streaming squeeze for a single byte `ch`: every run of `ch` is reduced
/// to one occurrence.
///
/// Scans with memchr and compacts in place. `was_squeeze_char` remembers
/// whether the previous buffer ended in `ch`, so a run spanning two reads
/// still collapses to a single byte.
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = match reader.read(&mut buf) {
            Ok(0) => break, // EOF
            Ok(n) => n,
            // Retry reads interrupted by a signal.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        // wp: write position; i: read cursor. Invariant: wp <= i.
        let mut wp = 0;
        let mut i = 0;

        while i < n {
            // A run continued from the previous buffer: its first byte was
            // already emitted, so drop the leading repeats here.
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    // Keep the bytes before the match, sliding left only if
                    // the cursors have diverged.
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    // Emit exactly one `ch`, then skip the rest of the run.
                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    // No more matches: keep the tail verbatim and finish
                    // this buffer.
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1549
/// Inputs up to this size are transformed into one buffer and emitted with
/// a single `write_all`; larger inputs are streamed in BUF_SIZE pieces.
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;
1558
1559pub fn translate_mmap(
1568 set1: &[u8],
1569 set2: &[u8],
1570 data: &[u8],
1571 writer: &mut impl Write,
1572) -> io::Result<()> {
1573 let table = build_translate_table(set1, set2);
1574
1575 let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
1577 if is_identity {
1578 return writer.write_all(data);
1579 }
1580
1581 if let Some((lo, hi, offset)) = detect_range_offset(&table) {
1583 return translate_mmap_range(data, writer, lo, hi, offset);
1584 }
1585
1586 translate_mmap_table(data, writer, &table)
1588}
1589
1590fn translate_mmap_range(
1592 data: &[u8],
1593 writer: &mut impl Write,
1594 lo: u8,
1595 hi: u8,
1596 offset: i8,
1597) -> io::Result<()> {
1598 if data.len() >= PARALLEL_THRESHOLD {
1600 let mut buf = alloc_uninit_vec(data.len());
1601 let n_threads = rayon::current_num_threads().max(1);
1602 let chunk_size = (data.len() / n_threads).max(32 * 1024);
1603
1604 data.par_chunks(chunk_size)
1606 .zip(buf.par_chunks_mut(chunk_size))
1607 .for_each(|(src_chunk, dst_chunk)| {
1608 translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
1609 });
1610
1611 return writer.write_all(&buf);
1612 }
1613
1614 if data.len() <= SINGLE_WRITE_LIMIT {
1616 let mut buf = alloc_uninit_vec(data.len());
1617 translate_range_simd(data, &mut buf, lo, hi, offset);
1618 return writer.write_all(&buf);
1619 }
1620 let mut buf = alloc_uninit_vec(BUF_SIZE);
1622 for chunk in data.chunks(BUF_SIZE) {
1623 translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
1624 writer.write_all(&buf[..chunk.len()])?;
1625 }
1626 Ok(())
1627}
1628
1629fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
1631 if data.len() >= PARALLEL_THRESHOLD {
1633 let mut buf = alloc_uninit_vec(data.len());
1634 let n_threads = rayon::current_num_threads().max(1);
1635 let chunk_size = (data.len() / n_threads).max(32 * 1024);
1636
1637 data.par_chunks(chunk_size)
1638 .zip(buf.par_chunks_mut(chunk_size))
1639 .for_each(|(src_chunk, dst_chunk)| {
1640 translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
1641 });
1642
1643 return writer.write_all(&buf);
1644 }
1645
1646 if data.len() <= SINGLE_WRITE_LIMIT {
1648 let mut buf = alloc_uninit_vec(data.len());
1649 translate_to(data, &mut buf, table);
1650 return writer.write_all(&buf);
1651 }
1652 let mut buf = alloc_uninit_vec(BUF_SIZE);
1653 for chunk in data.chunks(BUF_SIZE) {
1654 translate_to(chunk, &mut buf[..chunk.len()], table);
1655 writer.write_all(&buf[..chunk.len()])?;
1656 }
1657 Ok(())
1658}
1659
/// mmap-path translate+squeeze: maps `data` through the set1→set2 table,
/// then collapses runs of bytes that are members of `set2`.
///
/// The squeeze set is built from `set2` (the translation targets), so runs
/// are detected *after* translation.
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    // Large inputs: translate in parallel, then squeeze sequentially.
    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            // Table is a contiguous range-plus-offset: SIMD fast path.
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        // Squeezing is order-dependent, so this pass is single-threaded,
        // compacting `translated` in place (wp <= i always; 256 is the
        // "no previous squeezable byte" sentinel).
        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue; // repeat of the squeezed byte: drop
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256; // run broken
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    // Medium inputs: fused translate+squeeze in a single pass, one write.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        // SAFETY: capacity is data.len(); wp never exceeds i < len, and the
        // final set_len(wp) only exposes bytes written above.
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    // Huge inputs: stream chunk by chunk; `last_squeezed` carries squeeze
    // state across chunk boundaries so spanning runs still collapse.
    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}
1792
/// mmap-path delete: writes `data` minus every byte in `delete_chars`.
///
/// Dispatch, fastest first: memchr paths for 1–3 bytes, a range path when
/// the delete set is one contiguous byte range, otherwise a bitset
/// compaction that is parallelized, single-shot, or streamed by size.
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    // Large inputs: filter chunks in parallel into disjoint regions of one
    // buffer, then slide the survivors together.
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        // chunk_lens[k] = surviving byte count of the k-th chunk. par_chunks
        // and par_chunks_mut use the same chunk_size, so regions line up.
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Sequential compaction: move each chunk's survivors left so the
        // output is contiguous. write_pos <= src_offset always, and
        // ptr::copy (memmove semantics) tolerates the overlap.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    // Medium inputs: one filtering pass, one write.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    // Huge inputs: stream through a fixed-size buffer.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
1867
1868fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
1870 if data.len() >= PARALLEL_THRESHOLD {
1872 let n_threads = rayon::current_num_threads().max(1);
1873 let chunk_size = (data.len() / n_threads).max(32 * 1024);
1874
1875 let results: Vec<Vec<u8>> = data
1876 .par_chunks(chunk_size)
1877 .map(|chunk| {
1878 let mut out = alloc_uninit_vec(chunk.len());
1879 let wp = delete_range_chunk(chunk, &mut out, lo, hi);
1880 unsafe { out.set_len(wp) };
1881 out
1882 })
1883 .collect();
1884
1885 let slices: Vec<std::io::IoSlice> = results
1886 .iter()
1887 .filter(|r| !r.is_empty())
1888 .map(|r| std::io::IoSlice::new(r))
1889 .collect();
1890 return write_ioslices(writer, &slices);
1891 }
1892
1893 if data.len() <= SINGLE_WRITE_LIMIT {
1895 let mut outbuf = alloc_uninit_vec(data.len());
1896 let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
1897 return writer.write_all(&outbuf[..wp]);
1898 }
1899
1900 let mut outbuf = alloc_uninit_vec(BUF_SIZE);
1902 for chunk in data.chunks(BUF_SIZE) {
1903 let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
1904 writer.write_all(&outbuf[..wp])?;
1905 }
1906 Ok(())
1907}
1908
1909#[inline]
1912fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
1913 let len = chunk.len();
1914 let mut out_pos = 0;
1915 let mut i = 0;
1916
1917 while i + 8 <= len {
1918 unsafe {
1919 let b0 = *chunk.get_unchecked(i);
1920 let b1 = *chunk.get_unchecked(i + 1);
1921 let b2 = *chunk.get_unchecked(i + 2);
1922 let b3 = *chunk.get_unchecked(i + 3);
1923 let b4 = *chunk.get_unchecked(i + 4);
1924 let b5 = *chunk.get_unchecked(i + 5);
1925 let b6 = *chunk.get_unchecked(i + 6);
1926 let b7 = *chunk.get_unchecked(i + 7);
1927
1928 *outbuf.get_unchecked_mut(out_pos) = b0;
1929 out_pos += !is_member(member, b0) as usize;
1930 *outbuf.get_unchecked_mut(out_pos) = b1;
1931 out_pos += !is_member(member, b1) as usize;
1932 *outbuf.get_unchecked_mut(out_pos) = b2;
1933 out_pos += !is_member(member, b2) as usize;
1934 *outbuf.get_unchecked_mut(out_pos) = b3;
1935 out_pos += !is_member(member, b3) as usize;
1936 *outbuf.get_unchecked_mut(out_pos) = b4;
1937 out_pos += !is_member(member, b4) as usize;
1938 *outbuf.get_unchecked_mut(out_pos) = b5;
1939 out_pos += !is_member(member, b5) as usize;
1940 *outbuf.get_unchecked_mut(out_pos) = b6;
1941 out_pos += !is_member(member, b6) as usize;
1942 *outbuf.get_unchecked_mut(out_pos) = b7;
1943 out_pos += !is_member(member, b7) as usize;
1944 }
1945 i += 8;
1946 }
1947
1948 while i < len {
1949 unsafe {
1950 let b = *chunk.get_unchecked(i);
1951 *outbuf.get_unchecked_mut(out_pos) = b;
1952 out_pos += !is_member(member, b) as usize;
1953 }
1954 i += 1;
1955 }
1956
1957 out_pos
1958}
1959
/// mmap-path delete for exactly one byte `ch`, driven by `memchr_iter`.
///
/// Three strategies by input size: parallel per-chunk filtering gathered
/// with vectored writes, one filtered buffer + one write, or a streamed
/// fixed-size buffer.
fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Chunks are independent because deletion has no cross-chunk state;
        // each keeps only the gaps between matches.
        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                for pos in memchr::memchr_iter(ch, chunk) {
                    if pos > last {
                        out.extend_from_slice(&chunk[last..pos]);
                    }
                    last = pos + 1;
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        // Emit all chunk results with as few syscalls as possible.
        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Medium inputs: gather the kept runs into one buffer, one write.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, data) {
            if pos > last {
                outbuf.extend_from_slice(&data[last..pos]);
            }
            last = pos + 1;
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    // Huge inputs: copy kept runs into a reusable buffer, flushing once per
    // chunk. Output never exceeds input, so wp stays within outbuf.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2034
/// mmap-path delete for 2 or 3 bytes, driven by `memchr2_iter`/`memchr3_iter`.
///
/// Missing set members default to 0, but `is_three` gates which searcher
/// runs, so for a 2-byte set the filler `c2` is never searched.
/// NOTE(review): with a 1-byte set the 2-byte searcher would also match the
/// filler 0 — callers are expected to route single bytes elsewhere; verify
/// against `delete_mmap`'s dispatch.
fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Chunks are independent (deletion has no cross-chunk state); each
        // keeps only the gaps between matches.
        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                if is_three {
                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                } else {
                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        // Emit all chunk results with as few syscalls as possible.
        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    // Medium inputs: gather the kept runs into one buffer, one write.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    // Huge inputs: copy kept runs into a reusable buffer, flushing once per
    // chunk. Output never exceeds input, so wp stays within outbuf.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2146
/// mmap-path delete+squeeze: drops every byte in `delete_chars`, then
/// collapses runs of bytes in `squeeze_chars` among the survivors.
///
/// Squeezing is stateful left-to-right, so there is no parallel path:
/// inputs small enough for a single write get one fused pass, larger ones
/// are streamed with the squeeze state carried across chunks.
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        // SAFETY: capacity is data.len(); out_pos <= i < len, and the final
        // set_len(out_pos) only exposes bytes written above.
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue; // deleted byte: drop; squeeze state unchanged
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue; // repeat of the squeezed byte: drop
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256; // run broken
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    // Streamed path; last_squeezed persists across chunks so runs spanning
    // a chunk boundary still squeeze correctly.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            // SAFETY: at most one byte kept per input byte, and
            // chunk.len() <= buf_size, so out_pos < outbuf.len().
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}
2223
/// mmap-path squeeze: collapses every run of a byte in `squeeze_chars` to a
/// single occurrence.
///
/// 1–3 byte sets get memchr-based fast paths; larger sets use a bitset.
/// The parallel path squeezes chunks independently, then repairs runs that
/// straddle chunk boundaries while assembling the output slices.
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Boundary stitch: if the byte emitted just before this chunk is a
        // squeezable byte and this chunk starts with the same byte, drop the
        // chunk's leading copy (after local squeezing that is at most one
        // byte, but take_while keeps the logic general).
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    // Medium inputs: one squeezing pass into a fresh buffer, one write.
    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        // SAFETY: capacity is data.len(); wp <= i < len, and the final
        // set_len(wp) only exposes bytes written above.
        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    // Skip the rest of this run of b.
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    // Huge inputs: streamed; last_squeezed persists across chunks so runs
    // spanning a chunk boundary still collapse.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        // SAFETY: wp <= i < len <= buf_size, so all accesses are in bounds.
        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    // Skip the rest of this run of b.
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2354
2355fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
2357 let len = chunk.len();
2358 let mut out = Vec::with_capacity(len);
2359 let mut last_squeezed: u16 = 256;
2360 let mut i = 0;
2361
2362 unsafe {
2363 out.set_len(len);
2364 let inp = chunk.as_ptr();
2365 let outp: *mut u8 = out.as_mut_ptr();
2366 let mut wp = 0;
2367
2368 while i < len {
2369 let b = *inp.add(i);
2370 if is_member(member, b) {
2371 if last_squeezed != b as u16 {
2372 *outp.add(wp) = b;
2373 wp += 1;
2374 last_squeezed = b as u16;
2375 }
2376 i += 1;
2377 while i < len && *inp.add(i) == b {
2378 i += 1;
2379 }
2380 } else {
2381 last_squeezed = 256;
2382 *outp.add(wp) = b;
2383 wp += 1;
2384 i += 1;
2385 }
2386 }
2387 out.set_len(wp);
2388 }
2389 out
2390}
2391
/// mmap-path squeeze for exactly 2 or 3 bytes (`N` selects the searcher).
///
/// Large inputs are squeezed per chunk in parallel and stitched at the
/// boundaries; smaller inputs use a sequential memchr2/memchr3 scan that
/// copies kept spans into a reusable buffer.
fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        // Boundary stitch: drop a chunk's leading byte(s) when they continue
        // a squeezed run that the previous chunk already emitted.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    // Locate the next squeezable byte; N picks memchr2 vs memchr3 at
    // compile time.
    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    // Append $len bytes from $src to outbuf, flushing first if they do not
    // fit; spans larger than the whole buffer are written out directly.
    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    // Non-squeezable span before the match: copy verbatim;
                    // it also breaks any pending run.
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                // Emit one copy of b unless it continues the previous run.
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                // Skip the rest of this run of b.
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                // No more squeezable bytes: copy the tail and finish.
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
2498
/// mmap-path squeeze for a single byte `ch`.
///
/// Fast-out: if the input contains no doubled `ch` there is nothing to
/// squeeze and the data is emitted untouched. Otherwise large inputs are
/// squeezed per chunk in parallel (with boundary stitching); smaller ones
/// use a sequential memchr scan through a reusable buffer.
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // No "chch" pair anywhere means no run longer than 1: passthrough.
    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        // Squeeze each chunk locally; boundary runs are repaired below.
        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            // One copy of ch, then skip the rest of the run.
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        // Boundary stitch: if the previous chunk's output ends in ch and
        // this chunk's output starts with ch, drop the leading copy so the
        // run stays collapsed across the boundary.
        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    // Sequential path: copy gaps between runs into a reusable buffer,
    // flushing whenever the next span would overflow it.
    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    // A gap wider than the whole buffer is written directly.
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                // One copy of ch, then skip the rest of the run.
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                // Tail after the last run: copy verbatim.
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}