use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Largest number of `IoSlice`s handed to a single `write_vectored` call.
const MAX_IOV: usize = 1024;

/// Chunk size for buffered writes when draining mmap'd data.
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Buffer size for the streaming (reader/writer) paths.
const STREAM_BUF: usize = 8 * 1024 * 1024;

/// Inputs at least this large take the rayon-parallel paths.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

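/// Writes all slices with `write_vectored`, batching at most `MAX_IOV` slices
/// per call. A short vectored write falls back to `write_all` over the
/// partially written batch.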
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

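/// Returns a `Vec<u8>` of length `len` with uninitialized contents. Callers
/// must write every byte they later read; all call sites below fill the
/// buffer before use, which is why the clippy lint is allowed.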
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    v
}

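/// Builds the 256-byte mapping table for `tr SET1 SET2`. Bytes outside SET1
/// map to themselves; when SET2 is shorter than SET1, its last byte is
/// repeated (GNU tr's padding behavior), and an empty SET2 leaves the SET1
/// bytes unchanged.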
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

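/// Builds a 256-bit membership bitset (32 bytes, one bit per byte value);
/// `is_member` then tests a byte with a shift and a mask.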
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

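/// Translates `data` in place through `table`, dispatching at runtime to the
/// widest available SIMD kernel (AVX2, then SSSE3) and falling back to the
/// unrolled scalar loop elsewhere.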
#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

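/// AVX2 table translation. The 256-entry table is split into sixteen 16-byte
/// rows indexed by the high nibble; each row is broadcast to both 128-bit
/// lanes so `_mm256_shuffle_epi8` can look up the low nibble within a lane.
/// For each input vector the result is assembled by comparing the high
/// nibbles against each of the 16 row indices and OR-merging the masked row
/// lookups. The SSSE3 and out-of-place `translate_to*` variants below use
/// the same scheme.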
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

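/// Detects whether `table` is the identity map except for one contiguous byte
/// range shifted by a constant offset, e.g. `tr a-z A-Z` is `[b'a', b'z']`
/// with offset `-32`. Ranges spanning more than 128 values are rejected
/// because the biased signed-comparison SIMD kernels below cannot classify
/// them with a single compare.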
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    // Casting the offset to i8 is fine (byte arithmetic wraps mod 256), but
    // the range width must fit the signed-compare trick used by the SIMD
    // kernels, so anything wider than 128 values falls back to the table path.
    lo.filter(|&l| hi - l <= 127).map(|l| (l, hi, offset as i8))
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

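/// Range translation via a biased signed compare: adding `0x80 - lo` maps the
/// unsigned range `[lo, hi]` onto signed `[-128, -128 + (hi - lo)]`, so one
/// signed greater-than against `-128 + (hi - lo)` classifies every byte.
/// Bytes inside the range get `offset` added; everything else copies through.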
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

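/// Detects whether the delete set is exactly one contiguous byte range, e.g.
/// `tr -d 0-9`. Duplicates in `chars` are tolerated: the set is a range iff
/// the number of distinct bytes equals the span width. Spans wider than 128
/// values are rejected for the same signed-compare reason as
/// `detect_range_offset`.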
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut seen = [false; 256];
    let mut distinct = 0usize;
    let mut lo = chars[0];
    let mut hi = chars[0];
    for &c in chars {
        if !seen[c as usize] {
            seen[c as usize] = true;
            distinct += 1;
        }
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Contiguous iff every value in [lo, hi] occurs. Counting distinct bytes
    // (rather than chars.len()) keeps duplicated input characters from
    // producing a false match.
    if hi - lo <= 127 && (hi - lo) as usize + 1 == distinct {
        Some((lo, hi))
    } else {
        None
    }
}

#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

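/// AVX2 range deletion: classify 32 bytes with the biased signed compare,
/// turn the in-range lanes into a keep bitmask via `movemask`, then compact
/// the kept bytes 8 lanes at a time. Whole-vector keeps take a straight copy.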
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 16),
                    dp.add(wp + c0 + c1),
                    (keep_mask >> 16) as u8,
                );
                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 24),
                    dp.add(wp + c0 + c1 + c2),
                    (keep_mask >> 24) as u8,
                );
                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

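/// Compacts up to 8 bytes under a keep mask: walks the set bits
/// (`trailing_zeros` plus clear-lowest-bit) and writes the corresponding
/// source bytes contiguously to `dst`.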
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let mut m = mask;
        let mut w = 0;
        while m != 0 {
            let bit = m.trailing_zeros() as usize;
            *dst.add(w) = *src.add(bit);
            w += 1;
            m &= m - 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

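/// Streaming range deletion: reads `STREAM_BUF`-sized blocks, compacts into a
/// scratch buffer, and writes straight from the source block when nothing
/// was deleted.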
fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

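/// Streaming translate. Builds the mapping table once, then picks the
/// cheapest strategy: plain passthrough for an identity table, the SIMD
/// range kernel when the table is one shifted contiguous range (e.g.
/// `tr a-z A-Z`), and the shuffle-LUT table kernel otherwise.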
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

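/// Reads until `buf` is full or EOF, retrying on `Interrupted`. Filling the
/// buffer (rather than taking whatever a single `read` returns) keeps the
/// downstream SIMD passes and `write_all` calls working on large blocks.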
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

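/// `tr -s SET1 SET2` streaming: translates each block (range kernel when
/// possible), then squeezes repeats of SET2 bytes in place. `last_squeezed`
/// is a `u16` so the sentinel 256 can mean "nothing pending" across block
/// boundaries.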
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

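/// `tr -d SET1`, streaming. Small sets use memchr kernels, contiguous sets
/// use the SIMD range path, and everything else compacts through the
/// branchless bitset loop below (write the byte unconditionally, then bump
/// the write cursor only when the byte is kept).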
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                *ptr.add(wp) = b0;
                wp += !is_member(&member, b0) as usize;
                *ptr.add(wp) = b1;
                wp += !is_member(&member, b1) as usize;
                *ptr.add(wp) = b2;
                wp += !is_member(&member, b2) as usize;
                *ptr.add(wp) = b3;
                wp += !is_member(&member, b3) as usize;
                *ptr.add(wp) = b4;
                wp += !is_member(&member, b4) as usize;
                *ptr.add(wp) = b5;
                wp += !is_member(&member, b5) as usize;
                *ptr.add(wp) = b6;
                wp += !is_member(&member, b6) as usize;
                *ptr.add(wp) = b7;
                wp += !is_member(&member, b7) as usize;
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                *ptr.add(wp) = b;
                wp += !is_member(&member, b) as usize;
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &src[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &src[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let ptr = buf.as_mut_ptr();
        let mut wp = 0;
        let mut i = 0;

        if was_squeeze_char {
            while i < n && unsafe { *ptr.add(i) } == ch {
                i += 1;
            }
            if i >= n {
                continue;
            }
        }

        loop {
            match finder.find(&buf[i..n]) {
                Some(offset) => {
                    let copy_len = offset + 1;
                    if copy_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
                        }
                    }
                    wp += copy_len;
                    i += offset + 1;
                    while i < n && unsafe { *ptr.add(i) } == ch {
                        i += 1;
                    }
                    if i >= n {
                        was_squeeze_char = true;
                        break;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
                        }
                    }
                    wp += run_len;
                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

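/// mmap'd inputs up to this size are transformed into one allocation and
/// written with a single `write_all`; larger ones are chunked through
/// `BUF_SIZE` scratch buffers to bound memory use.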
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

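/// Translate over a memory-mapped (or otherwise fully loaded) input. Mirrors
/// `translate`, adding a rayon-parallel split for inputs past
/// `PARALLEL_THRESHOLD`.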
pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    translate_mmap_table(data, writer, &table)
}

fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

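/// `tr -s SET1 SET2` over an mmap'd input: translate (in parallel when
/// large), then squeeze serially, since squeezing carries state across
/// chunk boundaries.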
pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

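/// `tr -d SET1` over an mmap'd input. The parallel path compacts each chunk
/// into its own slot of one output buffer, then slides the compacted pieces
/// together in order before a single write.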
pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                for pos in memchr::memchr_iter(ch, chunk) {
                    if pos > last {
                        out.extend_from_slice(&chunk[last..pos]);
                    }
                    last = pos + 1;
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, data) {
            if pos > last {
                outbuf.extend_from_slice(&data[last..pos]);
            }
            last = pos + 1;
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;
        for pos in memchr::memchr_iter(ch, chunk) {
            if pos > last {
                let run = pos - last;
                outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                wp += run;
            }
            last = pos + 1;
        }
        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut last = 0;
                if is_three {
                    for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                } else {
                    for pos in memchr::memchr2_iter(c0, c1, chunk) {
                        if pos > last {
                            out.extend_from_slice(&chunk[last..pos]);
                        }
                        last = pos + 1;
                    }
                }
                if last < chunk.len() {
                    out.extend_from_slice(&chunk[last..]);
                }
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = Vec::with_capacity(data.len());
        let mut last = 0;
        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, data) {
                if pos > last {
                    outbuf.extend_from_slice(&data[last..pos]);
                }
                last = pos + 1;
            }
        }
        if last < data.len() {
            outbuf.extend_from_slice(&data[last..]);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];

    for chunk in data.chunks(buf_size) {
        let mut wp = 0;
        let mut last = 0;

        if is_three {
            for pos in memchr::memchr3_iter(c0, c1, c2, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        } else {
            for pos in memchr::memchr2_iter(c0, c1, chunk) {
                if pos > last {
                    let run = pos - last;
                    outbuf[wp..wp + run].copy_from_slice(&chunk[last..pos]);
                    wp += run;
                }
                last = pos + 1;
            }
        }

        if last < chunk.len() {
            let run = chunk.len() - last;
            outbuf[wp..wp + run].copy_from_slice(&chunk[last..]);
            wp += run;
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

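/// `tr -s SET1` over an mmap'd input. Parallel chunks are squeezed
/// independently and then stitched: if the previous non-empty chunk ended in
/// a squeezable byte, a leading repeat of that byte in the next chunk is
/// dropped before writing.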
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

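/// Squeezes one chunk into a fresh `Vec`, collapsing runs of member bytes by
/// emitting the first byte of a run and skipping to its end.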
fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

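/// Squeeze for 2- or 3-byte sets: scans with `memchr2`/`memchr3` (`N` picks
/// which), copying untouched gaps in bulk and emitting one byte per run.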
2435fn squeeze_multi_mmap<const N: usize>(
2436 chars: &[u8],
2437 data: &[u8],
2438 writer: &mut impl Write,
2439) -> io::Result<()> {
2440 if data.len() >= PARALLEL_THRESHOLD {
2442 let member = build_member_set(chars);
2443 let n_threads = rayon::current_num_threads().max(1);
2444 let chunk_size = (data.len() / n_threads).max(32 * 1024);
2445
2446 let results: Vec<Vec<u8>> = data
2447 .par_chunks(chunk_size)
2448 .map(|chunk| squeeze_chunk_bitset(chunk, &member))
2449 .collect();
2450
2451 let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
2453 for (idx, result) in results.iter().enumerate() {
2454 if result.is_empty() {
2455 continue;
2456 }
2457 if idx > 0 {
2458 if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
2459 if is_member(&member, prev_last) {
2460 let skip = result.iter().take_while(|&&b| b == prev_last).count();
2461 if skip < result.len() {
2462 slices.push(std::io::IoSlice::new(&result[skip..]));
2463 }
2464 continue;
2465 }
2466 }
2467 }
2468 slices.push(std::io::IoSlice::new(result));
2469 }
2470 return write_ioslices(writer, &slices);
2471 }
2472
2473 let buf_size = data.len().min(BUF_SIZE);
2474 let mut outbuf = vec![0u8; buf_size];
2475 let mut wp = 0;
2476 let mut last_squeezed: u16 = 256;
2477 let mut cursor = 0;
2478
2479 macro_rules! find_next {
2480 ($data:expr) => {
2481 if N == 2 {
2482 memchr::memchr2(chars[0], chars[1], $data)
2483 } else {
2484 memchr::memchr3(chars[0], chars[1], chars[2], $data)
2485 }
2486 };
2487 }
2488
2489 macro_rules! flush_and_copy {
2490 ($src:expr, $len:expr) => {
2491 if wp + $len > buf_size {
2492 writer.write_all(&outbuf[..wp])?;
2493 wp = 0;
2494 }
2495 if $len > buf_size {
2496 writer.write_all($src)?;
2497 } else {
2498 outbuf[wp..wp + $len].copy_from_slice($src);
2499 wp += $len;
2500 }
2501 };
2502 }
2503
2504 while cursor < data.len() {
2505 match find_next!(&data[cursor..]) {
2506 Some(offset) => {
2507 let pos = cursor + offset;
2508 let b = data[pos];
2509 if pos > cursor {
2510 let span = pos - cursor;
2511 flush_and_copy!(&data[cursor..pos], span);
2512 last_squeezed = 256;
2513 }
2514 if last_squeezed != b as u16 {
2515 if wp >= buf_size {
2516 writer.write_all(&outbuf[..wp])?;
2517 wp = 0;
2518 }
2519 outbuf[wp] = b;
2520 wp += 1;
2521 last_squeezed = b as u16;
2522 }
2523 let mut skip = pos + 1;
2524 while skip < data.len() && data[skip] == b {
2525 skip += 1;
2526 }
2527 cursor = skip;
2528 }
2529 None => {
2530 let remaining = data.len() - cursor;
2531 flush_and_copy!(&data[cursor..], remaining);
2532 break;
2533 }
2534 }
2535 }
2536 if wp > 0 {
2537 writer.write_all(&outbuf[..wp])?;
2538 }
2539 Ok(())
2540}
2541
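/// Squeeze for a single byte. A quick `memmem` probe for the doubled byte
/// lets untouched inputs be written straight through; otherwise runs are
/// collapsed with `memchr`, in parallel for large inputs.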
fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}