use std::io::{self, Read, Write};

use rayon::prelude::*;

const MAX_IOV: usize = 1024;

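// Chunk size for mmap'd inputs too large to translate in one in-memory copy.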
const BUF_SIZE: usize = 4 * 1024 * 1024;

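// Buffer size for the streaming (reader-to-writer) paths.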
const STREAM_BUF: usize = 4 * 1024 * 1024;

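// Inputs at least this large are fanned out across rayon worker threads.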
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

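/// Writes a list of `IoSlice`s, batching at most `MAX_IOV` slices per
/// `write_vectored` call and falling back to `write_all` for whatever part
/// of a batch was not accepted.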
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

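/// Returns a `Vec<u8>` of length `len` without zero-initializing it.
/// Callers must overwrite the buffer before reading from it.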
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    v
}

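/// Builds the 256-entry byte translation table for `set1 -> set2`. When
/// `set2` is shorter than `set1`, its last byte is repeated, mirroring
/// GNU tr's set extension behavior.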
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

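/// Packs a byte set into a 256-bit bitmap: 32 bytes, one bit per byte value.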
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

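/// Tests membership in a bitmap built by `build_member_set`. The unchecked
/// index is always in bounds because `ch >> 3` is at most 31.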
#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

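/// Translates a buffer in place through `table`, dispatching to the widest
/// SIMD implementation the running CPU supports.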
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

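// Scalar fallback, unrolled by eight to keep the table lookups pipelined.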
#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

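// Full 256-entry table lookup in AVX2: the table is split into sixteen
// 16-byte rows indexed by the high nibble. Each input vector is shuffled
// (`pshufb`) through every row using the low nibble, and the row whose high
// nibble matches is selected with a compare-and-mask. Sixteen shuffles per
// 32 bytes still beats scalar lookups on large buffers.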
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

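// 128-bit SSSE3 variant of the same nibble-LUT scheme.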
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

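/// Out-of-place counterpart of `translate_inplace`: translates `src` into
/// `dst` through `table`.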
#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

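/// Detects tables that translate one contiguous byte range by a constant
/// offset (e.g. `a-z` -> `A-Z`), which enables a much cheaper SIMD path.
/// Returns `(lo, hi, offset)`; offset arithmetic wraps modulo 256, which is
/// why truncating the offset to `i8` is harmless.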
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}

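/// Applies a contiguous-range, constant-offset translation out of place.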
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

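/// In-place variant of `translate_range_simd`, used by the streaming paths.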
#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

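/// Returns `Some((lo, hi))` when the delete set covers exactly the
/// contiguous range `lo..=hi`. Distinct bytes are counted through a seen
/// table so that duplicated input bytes cannot fake a contiguous range.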
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut seen = [false; 256];
    let mut distinct = 0usize;
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in chars {
        if !seen[c as usize] {
            seen[c as usize] = true;
            distinct += 1;
        }
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Comparing against the distinct count (not `chars.len()`) matters:
    // an input like [a, b, b, d] spans four values but covers only three,
    // and must not be treated as the range a..=d.
    if (hi - lo) as usize + 1 == distinct {
        Some((lo, hi))
    } else {
        None
    }
}

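/// Deletes every byte in `[lo, hi]` from `src` into `dst`, returning the
/// number of bytes kept.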
#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

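// `movemask` collapses the in-range compare to a bitmask; complementing it
// yields the keep mask, and clearing one trailing bit per iteration
// (`mask &= mask - 1`) emits the kept bytes in order.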
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 32;
        }

        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            let mut mask = keep_mask;
            while mask != 0 {
                let bit = mask.trailing_zeros() as usize;
                *dp.add(wp) = *sp.add(ri + bit);
                wp += 1;
                mask &= mask - 1;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

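/// Streaming delete of a contiguous byte range.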
fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

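/// Streaming `tr set1 set2`: identity tables degrade to a plain copy, and
/// contiguous-range tables take the constant-offset SIMD path.
///
/// A minimal usage sketch (assuming this module is in scope as `tr`):
///
/// ```ignore
/// let mut input = std::io::Cursor::new(b"hello".to_vec());
/// let mut output: Vec<u8> = Vec::new();
/// tr::translate(
///     b"abcdefghijklmnopqrstuvwxyz",
///     b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
///     &mut input,
///     &mut output,
/// )?;
/// assert_eq!(output, b"HELLO");
/// ```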
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

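/// Reads until `buf` is full or EOF, retrying on `Interrupted`; a short
/// return therefore always means end of input.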
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

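/// `tr -s set1 set2`: translate, then squeeze repeats of bytes in `set2`.
/// `last_squeezed` persists across buffer refills (256 is the "no byte"
/// sentinel) so runs spanning a read boundary still collapse to one byte.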
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

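/// `tr -d`: streaming delete with fast paths for one to three bytes
/// (memchr), contiguous ranges (SIMD compress), and a bitset fallback that
/// compacts the buffer in place; the write cursor never passes the read
/// cursor, so the in-place copy is safe.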
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                *ptr.add(wp) = b0;
                wp += !is_member(&member, b0) as usize;
                *ptr.add(wp) = b1;
                wp += !is_member(&member, b1) as usize;
                *ptr.add(wp) = b2;
                wp += !is_member(&member, b2) as usize;
                *ptr.add(wp) = b3;
                wp += !is_member(&member, b3) as usize;
                *ptr.add(wp) = b4;
                wp += !is_member(&member, b4) as usize;
                *ptr.add(wp) = b5;
                wp += !is_member(&member, b5) as usize;
                *ptr.add(wp) = b6;
                wp += !is_member(&member, b6) as usize;
                *ptr.add(wp) = b7;
                wp += !is_member(&member, b7) as usize;
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                *ptr.add(wp) = b;
                wp += !is_member(&member, b) as usize;
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

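// Single-byte delete: memchr jumps between occurrences and the kept gaps
// are memmoved left in place.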
fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &buf[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    offset,
                                );
                            }
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

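// Single-byte squeeze: `was_squeeze_char` carries the run state across
// buffer refills so a run split by a read boundary still collapses.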
fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let mut wp = 0;
        let mut i = 0;

        while i < n {
            if was_squeeze_char && buf[i] == ch {
                i += 1;
                while i < n && buf[i] == ch {
                    i += 1;
                }
                if i >= n {
                    break;
                }
            }

            match memchr::memchr(ch, &buf[i..n]) {
                Some(offset) => {
                    let run_len = offset;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    i += run_len;

                    unsafe {
                        *buf.as_mut_ptr().add(wp) = ch;
                    }
                    wp += 1;
                    was_squeeze_char = true;
                    i += 1;
                    while i < n && buf[i] == ch {
                        i += 1;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        if wp != i {
                            unsafe {
                                std::ptr::copy(
                                    buf.as_ptr().add(i),
                                    buf.as_mut_ptr().add(wp),
                                    run_len,
                                );
                            }
                        }
                        wp += run_len;
                    }
                    was_squeeze_char = false;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

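// Outputs up to this size are materialized in full and written in one call;
// anything larger falls back to chunked writes.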
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

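/// Translation over a fully loaded (e.g. mmap'd) input, choosing between
/// identity copy, range-offset SIMD, and full table lookup.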
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    translate_mmap_table(data, writer, &table)
}

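// Large inputs are translated in parallel into one output buffer; the
// source and destination chunk pairs line up because both sides are split
// with the same chunk size.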
fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }
    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

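/// `tr -s set1 set2` over fully loaded input: translate (in parallel when
/// large), then squeeze sequentially, since squeezing depends on the
/// previous kept byte.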
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

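/// `tr -d` over fully loaded input, with the same fast paths as `delete`
/// plus a parallel bitset path that compacts per-chunk results before one
/// final write.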
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        // Compact the per-chunk results leftward; `ptr::copy` tolerates the
        // overlapping ranges since the destination never passes the source.
        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

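// Branch-free delete: every byte is stored unconditionally and the write
// cursor advances only when the byte is kept.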
#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                for pos in memchr::memchr_iter(ch, chunk) {
                    if pos > last {
                        out.extend_from_slice(&chunk[last..pos]);
                    }
                    last = pos + 1;
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, data) {
            if pos > last {
                outbuf.extend_from_slice(&data[last..pos]);
            }
            last = pos + 1;
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                if is_three {
                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                } else {
                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

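/// `tr -s` over fully loaded input. The parallel path squeezes each chunk
/// independently and then repairs chunk boundaries: when the previous
/// chunk's last byte is squeezable and matches the next chunk's leading
/// run, that run is dropped before the slices are written out.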
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

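// Single-byte squeeze over the whole input; if no doubled byte exists the
// data is written through untouched.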
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}