use std::io::{self, Read, Write};

use rayon::prelude::*;

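/// Maximum number of `IoSlice`s handed to a single `write_vectored` call;
/// POSIX caps vectored I/O at `IOV_MAX` segments (commonly 1024).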
const MAX_IOV: usize = 1024;

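/// Block size for chunked writes in the mmap paths.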
const BUF_SIZE: usize = 4 * 1024 * 1024;

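/// Block size used by the streaming (reader-based) paths.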
const STREAM_BUF: usize = 16 * 1024 * 1024;

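/// Inputs at least this large are split across rayon worker threads.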
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

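/// Writes a sequence of `IoSlice`s in batches of `MAX_IOV`, falling back to
/// `write_all` for the remainder of a short vectored write.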
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

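/// Allocates a `Vec<u8>` of length `len` without zeroing it. Callers must
/// write every byte they later read.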
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    v
}

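/// Builds the 256-entry mapping for `tr SET1 SET2`. Bytes not in `SET1` map
/// to themselves; if `SET2` is shorter, its last byte is repeated for the
/// excess of `SET1` (GNU tr's padding behavior).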
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

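/// Builds a 256-bit membership bitset: bit `ch` is set iff `ch` is in `chars`.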
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

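/// Translates `data` in place through `table`, dispatching at runtime to the
/// widest SIMD implementation available.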
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

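// Full 256-byte table lookup with PSHUFB: the table is split into sixteen
// 16-byte rows indexed by a byte's high nibble. For each row, lanes whose
// high nibble matches select their translation by low nibble via
// `_mm256_shuffle_epi8`, and the sixteen partial results are OR-merged.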
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

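/// Detects tables that are the identity everywhere except one contiguous
/// byte range shifted by a constant offset (e.g. ASCII case conversion);
/// such tables can be applied with a few SIMD compares and adds.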
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}

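// The range translators classify bytes with a signed-compare trick: adding
// (0x80 - lo) maps lo..=hi onto -128..=(-128 + range), so a single signed
// compare against -128 + range tests membership (the lower bound is free,
// since no i8 is below -128); the offset is then added under that mask.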
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

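/// Returns `Some((lo, hi))` when `chars` covers exactly the contiguous byte
/// range `lo..=hi`. The check is count-based, so it presumes `chars` holds
/// no duplicates.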
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in &chars[1..] {
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Widen before adding 1: for a full-range set (lo = 0, hi = 255) the
    // span is 256, which overflows u8.
    if (hi as usize) - (lo as usize) + 1 == chars.len() {
        Some((lo, hi))
    } else {
        None
    }
}

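/// Compacts `src` into `dst`, dropping every byte in `lo..=hi`; returns the
/// number of bytes written.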
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 16),
                    dp.add(wp + c0 + c1),
                    (keep_mask >> 16) as u8,
                );
                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 24),
                    dp.add(wp + c0 + c1 + c2),
                    (keep_mask >> 24) as u8,
                );
                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

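/// Scalar tail of the SIMD delete: copies the bytes of one 8-byte lane whose
/// bits are set in `mask` (bit i selects `src[i]`) contiguously to `dst`.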
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let mut m = mask;
        let mut w = 0;
        while m != 0 {
            let bit = m.trailing_zeros() as usize;
            *dst.add(w) = *src.add(bit);
            w += 1;
            m &= m - 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

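/// Streaming translate (`tr SET1 SET2`): maps bytes read from `reader` and
/// writes the result to `writer`. An identity table degrades to a plain
/// copy; a contiguous shifted range uses the SIMD range path.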
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

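/// Reads until `buf` is full or EOF, retrying on `ErrorKind::Interrupted`;
/// a short count therefore means end of input.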
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            // Retry interrupted reads (the original early-returned on the
            // first read, so an EINTR there surfaced as an error).
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

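/// Streaming translate + squeeze (`tr -s SET1 SET2`): translates each block,
/// then collapses runs of bytes belonging to `SET2`.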
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

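/// Streaming delete (`tr -d SET1`): removes every byte in `delete_chars`,
/// choosing a memchr, SIMD-range, or bitset strategy based on the set.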
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                *ptr.add(wp) = b0;
                wp += !is_member(&member, b0) as usize;
                *ptr.add(wp) = b1;
                wp += !is_member(&member, b1) as usize;
                *ptr.add(wp) = b2;
                wp += !is_member(&member, b2) as usize;
                *ptr.add(wp) = b3;
                wp += !is_member(&member, b3) as usize;
                *ptr.add(wp) = b4;
                wp += !is_member(&member, b4) as usize;
                *ptr.add(wp) = b5;
                wp += !is_member(&member, b5) as usize;
                *ptr.add(wp) = b6;
                wp += !is_member(&member, b6) as usize;
                *ptr.add(wp) = b7;
                wp += !is_member(&member, b7) as usize;
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                *ptr.add(wp) = b;
                wp += !is_member(&member, b) as usize;
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &src[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &src[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

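/// Streaming delete + squeeze: drops bytes in `delete_chars`, then collapses
/// runs of bytes in `squeeze_chars`.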
pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

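/// Streaming squeeze (`tr -s SET1`): collapses each run of a repeated member
/// byte to a single occurrence.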
pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let ptr = buf.as_mut_ptr();
        let mut wp = 0;
        let mut i = 0;

        if was_squeeze_char {
            while i < n && unsafe { *ptr.add(i) } == ch {
                i += 1;
            }
            if i >= n {
                continue;
            }
        }

        loop {
            match finder.find(&buf[i..n]) {
                Some(offset) => {
                    let copy_len = offset + 1;
                    if copy_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
                        }
                    }
                    wp += copy_len;
                    i += offset + 1;
                    while i < n && unsafe { *ptr.add(i) } == ch {
                        i += 1;
                    }
                    if i >= n {
                        was_squeeze_char = true;
                        break;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
                        }
                    }
                    wp += run_len;
                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

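/// mmap'd inputs up to this size are transformed into one output buffer and
/// written with a single `write_all`; larger inputs are chunked via `BUF_SIZE`.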
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

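/// mmap-based translate: same dispatch as `translate`, but over a fully
/// mapped input slice, which permits parallel chunking on large inputs.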
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    translate_mmap_table(data, writer, &table)
}

fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

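/// mmap-based translate + squeeze: translation is parallelized across chunks,
/// then the squeeze pass runs sequentially, since run collapsing carries
/// state across chunk boundaries.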
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

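/// mmap-based delete: strategy selection mirrors `delete`, adding parallel
/// bitset compaction for large inputs.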
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                for pos in memchr::memchr_iter(ch, chunk) {
                    if pos > last {
                        out.extend_from_slice(&chunk[last..pos]);
                    }
                    last = pos + 1;
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, data) {
            if pos > last {
                outbuf.extend_from_slice(&data[last..pos]);
            }
            last = pos + 1;
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                if is_three {
                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                } else {
                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

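/// mmap-based delete + squeeze. Kept sequential: the squeeze state (last
/// emitted member byte) threads through the entire input.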
pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

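/// mmap-based squeeze. Parallel chunk results are stitched at the seams: a
/// chunk's leading run is dropped when the previous non-empty chunk already
/// ended with that (member) byte.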
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
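
// A minimal usage sketch of the streaming entry points, using in-memory
// readers and writers; illustrative rather than exhaustive, and assuming the
// GNU tr-style translate/delete/squeeze semantics implemented above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn translate_maps_bytes() {
        let mut out = Vec::new();
        translate(b"abc", b"xyz", &mut &b"aabbcc"[..], &mut out).unwrap();
        assert_eq!(out, b"xxyyzz");
    }

    #[test]
    fn delete_drops_members() {
        let mut out = Vec::new();
        delete(b"l", &mut &b"hello"[..], &mut out).unwrap();
        assert_eq!(out, b"heo");
    }

    #[test]
    fn squeeze_collapses_runs() {
        let mut out = Vec::new();
        squeeze(b" ", &mut &b"a  b   c"[..], &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }
}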