use std::io::{self, Read, Write};

use rayon::prelude::*;

/// Maximum number of `IoSlice`s handed to a single `write_vectored` call
/// (matches the typical `IOV_MAX` of 1024 on Linux).
const MAX_IOV: usize = 1024;

/// Scratch-buffer size for chunked writes over mmap'd input.
const BUF_SIZE: usize = 4 * 1024 * 1024;

/// Buffer size for streaming (pipe/stdin) I/O.
const STREAM_BUF: usize = 16 * 1024 * 1024;

/// Inputs at least this large are split across rayon worker threads.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

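/// Writes a sequence of `IoSlice`s, batching at most `MAX_IOV` slices per
/// `write_vectored` call and finishing any short write with `write_all`.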
#[inline]
fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match writer.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        writer.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        writer.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}

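/// Allocates a `Vec<u8>` of length `len` without zero-initialization.
/// Callers must fully overwrite the prefix they read back.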
#[inline]
#[allow(clippy::uninit_vec)]
fn alloc_uninit_vec(len: usize) -> Vec<u8> {
    let mut v = Vec::with_capacity(len);
    unsafe {
        v.set_len(len);
    }
    v
}

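/// Builds the 256-entry translation table for `tr SET1 SET2`. Positions in
/// `set1` beyond the length of `set2` map to the last byte of `set2` (GNU
/// tr's padding rule); all other bytes map to themselves.
///
/// For example, `build_translate_table(b"abc", b"xy")` maps `a -> x`,
/// `b -> y`, `c -> y`, and leaves the remaining 253 bytes unchanged.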
#[inline]
fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
    let last = set2.last().copied();
    for (i, &from) in set1.iter().enumerate() {
        table[from as usize] = if i < set2.len() {
            set2[i]
        } else {
            last.unwrap_or(from)
        };
    }
    table
}

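/// Packs a byte set into a 256-bit bitmask (32 bytes); `is_member` then
/// tests membership with one shift, one mask, and one load.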
#[inline]
fn build_member_set(chars: &[u8]) -> [u8; 32] {
    let mut set = [0u8; 32];
    for &ch in chars {
        set[ch as usize >> 3] |= 1 << (ch & 7);
    }
    set
}

#[inline(always)]
fn is_member(set: &[u8; 32], ch: u8) -> bool {
    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
}

#[inline(always)]
fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_inplace_avx2_table(data, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_inplace_ssse3_table(data, table) };
            return;
        }
    }
    translate_inplace_scalar(data, table);
}

#[inline(always)]
fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
    let len = data.len();
    let ptr = data.as_mut_ptr();
    let mut i = 0;
    unsafe {
        while i + 8 <= len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

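/// Vectorized 256-byte table lookup using the nibble-LUT technique: the
/// table is split into 16 rows of 16 entries, each row is looked up with
/// `vpshufb` on the low nibble of every byte, and the row matching each
/// byte's high nibble is selected with a compare-and-mask, ORing the 16
/// partial results together.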
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);

            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = data.len();
        let ptr = data.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
            i += 1;
        }
    }
}

#[inline(always)]
fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    debug_assert!(dst.len() >= src.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            unsafe { translate_to_avx2_table(src, dst, table) };
            return;
        }
        if is_x86_feature_detected!("ssse3") {
            unsafe { translate_to_ssse3_table(src, dst, table) };
            return;
        }
    }
    translate_to_scalar(src, dst, table);
}

#[inline(always)]
fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    unsafe {
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let len = src.len();
        let mut i = 0;
        while i + 8 <= len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
            i += 8;
        }
        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm256_setzero_si256(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
        }

        let lo_mask = _mm256_set1_epi8(0x0F);

        let mut i = 0;
        while i + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(i) as *const _);
            let lo_nibble = _mm256_and_si256(input, lo_mask);
            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);

            let mut result = _mm256_setzero_si256();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm256_set1_epi8($h as i8);
                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm256_storeu_si256(dp.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let lo_mask128 = _mm_set1_epi8(0x0F);
            let mut lut128 = [_mm_setzero_si128(); 16];
            for h in 0u8..16 {
                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
            }

            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nib = _mm_and_si128(input, lo_mask128);
            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);

            let mut res = _mm_setzero_si128();
            macro_rules! do_nibble128 {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble128!(0);
            do_nibble128!(1);
            do_nibble128!(2);
            do_nibble128!(3);
            do_nibble128!(4);
            do_nibble128!(5);
            do_nibble128!(6);
            do_nibble128!(7);
            do_nibble128!(8);
            do_nibble128!(9);
            do_nibble128!(10);
            do_nibble128!(11);
            do_nibble128!(12);
            do_nibble128!(13);
            do_nibble128!(14);
            do_nibble128!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, res);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
    use std::arch::x86_64::*;

    unsafe {
        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();

        let mut lut = [_mm_setzero_si128(); 16];
        for h in 0u8..16 {
            let base = (h as usize) * 16;
            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
        }

        let lo_mask = _mm_set1_epi8(0x0F);

        let mut i = 0;
        while i + 16 <= len {
            let input = _mm_loadu_si128(sp.add(i) as *const _);
            let lo_nibble = _mm_and_si128(input, lo_mask);
            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);

            let mut result = _mm_setzero_si128();

            macro_rules! do_nibble {
                ($h:expr) => {
                    let h_val = _mm_set1_epi8($h as i8);
                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
                };
            }
            do_nibble!(0);
            do_nibble!(1);
            do_nibble!(2);
            do_nibble!(3);
            do_nibble!(4);
            do_nibble!(5);
            do_nibble!(6);
            do_nibble!(7);
            do_nibble!(8);
            do_nibble!(9);
            do_nibble!(10);
            do_nibble!(11);
            do_nibble!(12);
            do_nibble!(13);
            do_nibble!(14);
            do_nibble!(15);

            _mm_storeu_si128(dp.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
            i += 1;
        }
    }
}

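/// Detects whether `table` is the identity except for one contiguous input
/// range `[lo, hi]` shifted by a constant offset (e.g. ASCII case
/// conversion). Such tables can skip the generic LUT and use the cheaper
/// compare-and-add SIMD path.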
#[inline]
fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
    let mut lo: Option<u8> = None;
    let mut hi = 0u8;
    let mut offset = 0i16;

    for i in 0..256 {
        if table[i] != i as u8 {
            let diff = table[i] as i16 - i as i16;
            match lo {
                None => {
                    lo = Some(i as u8);
                    hi = i as u8;
                    offset = diff;
                }
                Some(_) => {
                    if diff != offset || i as u8 != hi.wrapping_add(1) {
                        return None;
                    }
                    hi = i as u8;
                }
            }
        }
    }

    lo.map(|l| (l, hi, offset as i8))
}

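/// Range translation without a table: inputs are biased by `0x80 - lo` so
/// that a single signed greater-than comparison against `0x80 + (hi - lo)`
/// (interpreted as `i8`) tests `lo <= b <= hi`, and `offset` is added under
/// the resulting mask.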
#[cfg(target_arch = "x86_64")]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(src.as_ptr().add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *src.get_unchecked(i);
            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for (i, &b) in src.iter().enumerate() {
        dst[i] = if b >= lo && b <= hi {
            b.wrapping_add(offset as u8)
        } else {
            b
        };
    }
}

#[cfg(target_arch = "x86_64")]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    if is_x86_feature_detected!("avx2") {
        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
    } else {
        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm256_set1_epi8(offset);
        let zero = _mm256_setzero_si256();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 32 <= len {
            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let mask = _mm256_cmpeq_epi8(gt, zero);
            let offset_masked = _mm256_and_si256(mask, offset_v);
            let result = _mm256_add_epi8(input, offset_masked);
            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
            i += 32;
        }

        if i + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let offset_v128 = _mm_set1_epi8(offset);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let mask = _mm_cmpeq_epi8(gt, zero128);
            let offset_masked = _mm_and_si128(mask, offset_v128);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let offset_v = _mm_set1_epi8(offset);
        let zero = _mm_setzero_si128();

        let len = data.len();
        let ptr = data.as_mut_ptr();
        let mut i = 0;

        while i + 16 <= len {
            let input = _mm_loadu_si128(ptr.add(i) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let mask = _mm_cmpeq_epi8(gt, zero);
            let offset_masked = _mm_and_si128(mask, offset_v);
            let result = _mm_add_epi8(input, offset_masked);
            _mm_storeu_si128(ptr.add(i) as *mut _, result);
            i += 16;
        }

        while i < len {
            let b = *ptr.add(i);
            *ptr.add(i) = if b >= lo && b <= hi {
                b.wrapping_add(offset as u8)
            } else {
                b
            };
            i += 1;
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
    for b in data.iter_mut() {
        if *b >= lo && *b <= hi {
            *b = b.wrapping_add(offset as u8);
        }
    }
}

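/// Returns `Some((lo, hi))` when the delete set covers exactly the
/// contiguous byte range `[lo, hi]`, enabling the SIMD compare-and-compact
/// path below.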
#[inline]
fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
    if chars.is_empty() {
        return None;
    }
    let mut lo = chars[0];
    let mut hi = chars[0];
    let mut seen = [false; 256];
    let mut distinct = 0usize;
    for &c in chars {
        if !seen[c as usize] {
            seen[c as usize] = true;
            distinct += 1;
        }
        if c < lo {
            lo = c;
        }
        if c > hi {
            hi = c;
        }
    }
    // Count distinct bytes rather than raw length so duplicates in the set
    // cannot fake a contiguous range, and do the arithmetic in usize so a
    // full 0..=255 range does not overflow u8.
    if (hi - lo) as usize + 1 == distinct {
        Some((lo, hi))
    } else {
        None
    }
}

#[cfg(target_arch = "x86_64")]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    if is_x86_feature_detected!("avx2") {
        unsafe { delete_range_avx2(src, dst, lo, hi) }
    } else {
        unsafe { delete_range_sse2(src, dst, lo, hi) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm256_setzero_si256();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 32 <= len {
            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
            let biased = _mm256_add_epi8(input, bias_v);
            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm256_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);

            if keep_mask == 0xFFFFFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
                wp += 32;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                let c1 = ((keep_mask >> 8) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 16),
                    dp.add(wp + c0 + c1),
                    (keep_mask >> 16) as u8,
                );
                let c2 = ((keep_mask >> 16) as u8).count_ones() as usize;
                compact_8bytes(
                    sp.add(ri + 24),
                    dp.add(wp + c0 + c1 + c2),
                    (keep_mask >> 24) as u8,
                );
                let c3 = ((keep_mask >> 24) as u8).count_ones() as usize;
                wp += c0 + c1 + c2 + c3;
            }
            ri += 32;
        }

        if ri + 16 <= len {
            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
            let zero128 = _mm_setzero_si128();

            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v128);
            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
            let in_range = _mm_cmpeq_epi8(gt, zero128);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

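/// Scalar left-pack: copies the bytes of an 8-byte group whose bits are set
/// in `mask` to consecutive positions at `dst`. `dst` must have room for up
/// to 8 bytes.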
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
    unsafe {
        let mut m = mask;
        let mut w = 0;
        while m != 0 {
            let bit = m.trailing_zeros() as usize;
            *dst.add(w) = *src.add(bit);
            w += 1;
            m &= m - 1;
        }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    use std::arch::x86_64::*;

    unsafe {
        let range = hi - lo;
        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
        let zero = _mm_setzero_si128();

        let len = src.len();
        let sp = src.as_ptr();
        let dp = dst.as_mut_ptr();
        let mut ri = 0;
        let mut wp = 0;

        while ri + 16 <= len {
            let input = _mm_loadu_si128(sp.add(ri) as *const _);
            let biased = _mm_add_epi8(input, bias_v);
            let gt = _mm_cmpgt_epi8(biased, threshold_v);
            let in_range = _mm_cmpeq_epi8(gt, zero);
            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;

            if keep_mask == 0xFFFF {
                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
                wp += 16;
            } else if keep_mask != 0 {
                compact_8bytes(sp.add(ri), dp.add(wp), keep_mask as u8);
                let c0 = (keep_mask as u8).count_ones() as usize;
                compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), (keep_mask >> 8) as u8);
                wp += c0 + ((keep_mask >> 8) as u8).count_ones() as usize;
            }
            ri += 16;
        }

        while ri < len {
            let b = *sp.add(ri);
            if b < lo || b > hi {
                *dp.add(wp) = b;
                wp += 1;
            }
            ri += 1;
        }

        wp
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
    let mut wp = 0;
    for &b in src {
        if b < lo || b > hi {
            dst[wp] = b;
            wp += 1;
        }
    }
    wp
}

fn delete_range_streaming(
    lo: u8,
    hi: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let wp = delete_range_chunk(&src[..n], &mut dst, lo, hi);
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

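/// Streaming translation, the equivalent of `tr SET1 SET2`.
///
/// A minimal usage sketch (any `Read`/`Write` pair works; a byte slice and
/// a `Vec<u8>` are used here for illustration):
///
/// ```ignore
/// let mut input: &[u8] = b"hello";
/// let mut output: Vec<u8> = Vec::new();
/// translate(b"el", b"ip", &mut input, &mut output)?;
/// assert_eq!(output, b"hippo");
/// ```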
pub fn translate(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return passthrough_stream(reader, writer);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_range_stream(lo, hi, offset, reader, writer);
    }

    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_inplace(&mut buf[..n], &table);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn translate_range_stream(
    lo: u8,
    hi: u8,
    offset: i8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    let mut buf = vec![0u8; STREAM_BUF];
    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        writer.write_all(&buf[..n])?;
    }
    Ok(())
}

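/// Reads until `buf` is full or EOF, retrying on `Interrupted`; a short
/// return therefore means end of input.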
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

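/// Translation followed by squeezing of repeated bytes from `set2`
/// (`tr -s SET1 SET2`). `last_squeezed` carries across buffer refills so
/// runs that straddle a read boundary still collapse; 256 is a sentinel no
/// byte can equal.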
pub fn translate_squeeze(
    set1: &[u8],
    set2: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    let range_info = detect_range_offset(&table);

    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        if let Some((lo, hi, offset)) = range_info {
            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
        } else {
            translate_inplace(&mut buf[..n], &table);
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

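/// Streaming delete (`tr -d SET1`). Dispatches to memchr for up to three
/// characters, to the SIMD compare-and-compact path for a contiguous range,
/// and to a branchless bitset filter otherwise.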
pub fn delete(
    delete_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_streaming(delete_chars[0], reader, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_streaming(delete_chars, reader, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_streaming(lo, hi, reader, writer);
    }

    let member = build_member_set(delete_chars);
    let mut buf = vec![0u8; STREAM_BUF];

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut i = 0;
            while i + 8 <= n {
                let b0 = *ptr.add(i);
                let b1 = *ptr.add(i + 1);
                let b2 = *ptr.add(i + 2);
                let b3 = *ptr.add(i + 3);
                let b4 = *ptr.add(i + 4);
                let b5 = *ptr.add(i + 5);
                let b6 = *ptr.add(i + 6);
                let b7 = *ptr.add(i + 7);

                *ptr.add(wp) = b0;
                wp += !is_member(&member, b0) as usize;
                *ptr.add(wp) = b1;
                wp += !is_member(&member, b1) as usize;
                *ptr.add(wp) = b2;
                wp += !is_member(&member, b2) as usize;
                *ptr.add(wp) = b3;
                wp += !is_member(&member, b3) as usize;
                *ptr.add(wp) = b4;
                wp += !is_member(&member, b4) as usize;
                *ptr.add(wp) = b5;
                wp += !is_member(&member, b5) as usize;
                *ptr.add(wp) = b6;
                wp += !is_member(&member, b6) as usize;
                *ptr.add(wp) = b7;
                wp += !is_member(&member, b7) as usize;
                i += 8;
            }
            while i < n {
                let b = *ptr.add(i);
                *ptr.add(wp) = b;
                wp += !is_member(&member, b) as usize;
                i += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn delete_single_streaming(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            match memchr::memchr(ch, &src[i..n]) {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

fn delete_multi_streaming(
    chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let mut src = vec![0u8; STREAM_BUF];
    let mut dst = alloc_uninit_vec(STREAM_BUF);
    loop {
        let n = read_full(reader, &mut src)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        let mut i = 0;
        while i < n {
            let found = if chars.len() == 2 {
                memchr::memchr2(chars[0], chars[1], &src[i..n])
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], &src[i..n])
            };
            match found {
                Some(offset) => {
                    if offset > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                offset,
                            );
                        }
                        wp += offset;
                    }
                    i += offset + 1;
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 {
                        unsafe {
                            std::ptr::copy_nonoverlapping(
                                src.as_ptr().add(i),
                                dst.as_mut_ptr().add(wp),
                                run_len,
                            );
                        }
                        wp += run_len;
                    }
                    break;
                }
            }
        }
        if wp == n {
            writer.write_all(&src[..n])?;
        } else if wp > 0 {
            writer.write_all(&dst[..wp])?;
        }
    }
    Ok(())
}

pub fn delete_squeeze(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&delete_set, b) {
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn squeeze(
    squeeze_chars: &[u8],
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_stream(squeeze_chars[0], reader, writer);
    }

    let member = build_member_set(squeeze_chars);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut last_squeezed: u16 = 256;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..n {
                let b = *ptr.add(i);
                if is_member(&member, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_stream(
    ch: u8,
    reader: &mut impl Read,
    writer: &mut impl Write,
) -> io::Result<()> {
    let pair = [ch, ch];
    let finder = memchr::memmem::Finder::new(&pair);
    let mut buf = vec![0u8; STREAM_BUF];
    let mut was_squeeze_char = false;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let ptr = buf.as_mut_ptr();
        let mut wp = 0;
        let mut i = 0;

        if was_squeeze_char {
            while i < n && unsafe { *ptr.add(i) } == ch {
                i += 1;
            }
            if i >= n {
                continue;
            }
        }

        loop {
            match finder.find(&buf[i..n]) {
                Some(offset) => {
                    let copy_len = offset + 1;
                    if copy_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), copy_len);
                        }
                    }
                    wp += copy_len;
                    i += offset + 1;
                    while i < n && unsafe { *ptr.add(i) } == ch {
                        i += 1;
                    }
                    if i >= n {
                        was_squeeze_char = true;
                        break;
                    }
                }
                None => {
                    let run_len = n - i;
                    if run_len > 0 && wp != i {
                        unsafe {
                            std::ptr::copy(ptr.add(i), ptr.add(wp), run_len);
                        }
                    }
                    wp += run_len;
                    was_squeeze_char = n > 0 && unsafe { *ptr.add(n - 1) } == ch;
                    break;
                }
            }
        }

        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

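/// Translation for a mutable in-memory buffer: translates in place (in
/// parallel above `PARALLEL_THRESHOLD`) and writes the result once.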
pub fn translate_owned(
    set1: &[u8],
    set2: &[u8],
    data: &mut [u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        if data.len() >= PARALLEL_THRESHOLD {
            let n_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / n_threads).max(32 * 1024);
            data.par_chunks_mut(chunk_size).for_each(|chunk| {
                translate_range_simd_inplace(chunk, lo, hi, offset);
            });
        } else {
            translate_range_simd_inplace(data, lo, hi, offset);
        }
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);
        data.par_chunks_mut(chunk_size).for_each(|chunk| {
            translate_inplace(chunk, &table);
        });
    } else {
        translate_inplace(data, &table);
    }
    writer.write_all(data)
}

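/// Outputs up to this size are written with a single `write_all`; larger
/// ones are chunked through a `BUF_SIZE` scratch buffer.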
const SINGLE_WRITE_LIMIT: usize = 16 * 1024 * 1024;

pub fn translate_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);

    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
    if is_identity {
        return writer.write_all(data);
    }

    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
        return translate_mmap_range(data, writer, lo, hi, offset);
    }

    translate_mmap_table(data, writer, &table)
}

fn translate_mmap_range(
    data: &[u8],
    writer: &mut impl Write,
    lo: u8,
    hi: u8,
    offset: i8,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_range_simd(data, &mut buf, lo, hi, offset);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let mut buf = alloc_uninit_vec(data.len());
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        data.par_chunks(chunk_size)
            .zip(buf.par_chunks_mut(chunk_size))
            .for_each(|(src_chunk, dst_chunk)| {
                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
            });

        return writer.write_all(&buf);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf = alloc_uninit_vec(data.len());
        translate_to(data, &mut buf, table);
        return writer.write_all(&buf);
    }

    let mut buf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        translate_to(chunk, &mut buf[..chunk.len()], table);
        writer.write_all(&buf[..chunk.len()])?;
    }
    Ok(())
}

pub fn translate_squeeze_mmap(
    set1: &[u8],
    set2: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let table = build_translate_table(set1, set2);
    let squeeze_set = build_member_set(set2);

    if data.len() >= PARALLEL_THRESHOLD {
        let mut translated = alloc_uninit_vec(data.len());
        let range_info = detect_range_offset(&table);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        if let Some((lo, hi, offset)) = range_info {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_range_simd(
                        src_chunk,
                        &mut dst_chunk[..src_chunk.len()],
                        lo,
                        hi,
                        offset,
                    );
                });
        } else {
            data.par_chunks(chunk_size)
                .zip(translated.par_chunks_mut(chunk_size))
                .for_each(|(src_chunk, dst_chunk)| {
                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
                });
        }

        let mut last_squeezed: u16 = 256;
        let len = translated.len();
        let mut wp = 0;
        unsafe {
            let ptr = translated.as_mut_ptr();
            let mut i = 0;
            while i < len {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        return writer.write_all(&translated[..wp]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut buf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            buf.set_len(data.len());
            let outp: *mut u8 = buf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut wp = 0;
            let mut i = 0;
            while i < len {
                let translated = *table.get_unchecked(*inp.add(i) as usize);
                if is_member(&squeeze_set, translated) {
                    if last_squeezed == translated as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = translated as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(wp) = translated;
                wp += 1;
                i += 1;
            }
            buf.set_len(wp);
        }
        return writer.write_all(&buf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut buf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        translate_to(chunk, &mut buf[..chunk.len()], &table);
        let mut wp = 0;
        unsafe {
            let ptr = buf.as_mut_ptr();
            for i in 0..chunk.len() {
                let b = *ptr.add(i);
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *ptr.add(wp) = b;
                wp += 1;
            }
        }
        writer.write_all(&buf[..wp])?;
    }
    Ok(())
}

pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if delete_chars.len() == 1 {
        return delete_single_char_mmap(delete_chars[0], data, writer);
    }
    if delete_chars.len() <= 3 {
        return delete_multi_memchr_mmap(delete_chars, data, writer);
    }

    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
        return delete_range_mmap(data, writer, lo, hi);
    }

    let member = build_member_set(delete_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let mut outbuf = alloc_uninit_vec(data.len());
        let chunk_lens: Vec<usize> = data
            .par_chunks(chunk_size)
            .zip(outbuf.par_chunks_mut(chunk_size))
            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
            .collect();

        let mut write_pos = 0;
        let mut src_offset = 0;
        for &clen in &chunk_lens {
            if clen > 0 && src_offset != write_pos {
                unsafe {
                    std::ptr::copy(
                        outbuf.as_ptr().add(src_offset),
                        outbuf.as_mut_ptr().add(write_pos),
                        clen,
                    );
                }
            }
            write_pos += clen;
            src_offset += chunk_size;
        }

        return writer.write_all(&outbuf[..write_pos]);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let out_pos = delete_chunk_bitset_into(data, &member, &mut outbuf);
        return writer.write_all(&outbuf[..out_pos]);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = alloc_uninit_vec(buf_size);

    for chunk in data.chunks(buf_size) {
        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = alloc_uninit_vec(chunk.len());
                let wp = delete_range_chunk(chunk, &mut out, lo, hi);
                unsafe { out.set_len(wp) };
                out
            })
            .collect();

        let slices: Vec<std::io::IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| std::io::IoSlice::new(r))
            .collect();
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf = alloc_uninit_vec(data.len());
        let wp = delete_range_chunk(data, &mut outbuf, lo, hi);
        return writer.write_all(&outbuf[..wp]);
    }

    let mut outbuf = alloc_uninit_vec(BUF_SIZE);
    for chunk in data.chunks(BUF_SIZE) {
        let wp = delete_range_chunk(chunk, &mut outbuf, lo, hi);
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

#[inline]
fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
    let len = chunk.len();
    let mut out_pos = 0;
    let mut i = 0;

    while i + 8 <= len {
        unsafe {
            let b0 = *chunk.get_unchecked(i);
            let b1 = *chunk.get_unchecked(i + 1);
            let b2 = *chunk.get_unchecked(i + 2);
            let b3 = *chunk.get_unchecked(i + 3);
            let b4 = *chunk.get_unchecked(i + 4);
            let b5 = *chunk.get_unchecked(i + 5);
            let b6 = *chunk.get_unchecked(i + 6);
            let b7 = *chunk.get_unchecked(i + 7);

            *outbuf.get_unchecked_mut(out_pos) = b0;
            out_pos += !is_member(member, b0) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b1;
            out_pos += !is_member(member, b1) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b2;
            out_pos += !is_member(member, b2) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b3;
            out_pos += !is_member(member, b3) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b4;
            out_pos += !is_member(member, b4) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b5;
            out_pos += !is_member(member, b5) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b6;
            out_pos += !is_member(member, b6) as usize;
            *outbuf.get_unchecked_mut(out_pos) = b7;
            out_pos += !is_member(member, b7) as usize;
        }
        i += 8;
    }

    while i < len {
        unsafe {
            let b = *chunk.get_unchecked(i);
            *outbuf.get_unchecked_mut(out_pos) = b;
            out_pos += !is_member(member, b) as usize;
        }
        i += 1;
    }

    out_pos
}

fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let mut iov: Vec<std::io::IoSlice> = Vec::new();
    let mut last = 0;
    for pos in memchr::memchr_iter(ch, data) {
        if pos > last {
            iov.push(std::io::IoSlice::new(&data[last..pos]));
        }
        last = pos + 1;
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    write_ioslices(writer, &iov)
}

fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    let c0 = chars[0];
    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
    let is_three = chars.len() >= 3;

    let mut iov: Vec<std::io::IoSlice> = Vec::new();
    let mut last = 0;
    if is_three {
        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
            if pos > last {
                iov.push(std::io::IoSlice::new(&data[last..pos]));
            }
            last = pos + 1;
        }
    } else {
        for pos in memchr::memchr2_iter(c0, c1, data) {
            if pos > last {
                iov.push(std::io::IoSlice::new(&data[last..pos]));
            }
            last = pos + 1;
        }
    }
    if last < data.len() {
        iov.push(std::io::IoSlice::new(&data[last..]));
    }
    write_ioslices(writer, &iov)
}

pub fn delete_squeeze_mmap(
    delete_chars: &[u8],
    squeeze_chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    let delete_set = build_member_set(delete_chars);
    let squeeze_set = build_member_set(squeeze_chars);

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        unsafe {
            outbuf.set_len(data.len());
            let outp: *mut u8 = outbuf.as_mut_ptr();
            let inp = data.as_ptr();
            let len = data.len();
            let mut out_pos = 0;
            let mut i = 0;
            while i < len {
                let b = *inp.add(i);
                if is_member(&delete_set, b) {
                    i += 1;
                    continue;
                }
                if is_member(&squeeze_set, b) {
                    if last_squeezed == b as u16 {
                        i += 1;
                        continue;
                    }
                    last_squeezed = b as u16;
                } else {
                    last_squeezed = 256;
                }
                *outp.add(out_pos) = b;
                out_pos += 1;
                i += 1;
            }
            outbuf.set_len(out_pos);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let mut out_pos = 0;
        for &b in chunk {
            if is_member(&delete_set, b) {
                continue;
            }
            if is_member(&squeeze_set, b) {
                if last_squeezed == b as u16 {
                    continue;
                }
                last_squeezed = b as u16;
            } else {
                last_squeezed = 256;
            }
            unsafe {
                *outbuf.get_unchecked_mut(out_pos) = b;
            }
            out_pos += 1;
        }
        writer.write_all(&outbuf[..out_pos])?;
    }
    Ok(())
}

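/// Squeeze over an mmap'd slice (`tr -s SET1`). In the parallel path each
/// chunk is squeezed independently; adjacent results are then stitched by
/// dropping a leading run that continues the previous chunk's final
/// squeezable byte.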
pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if squeeze_chars.len() == 1 {
        return squeeze_single_mmap(squeeze_chars[0], data, writer);
    }
    if squeeze_chars.len() == 2 {
        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
    }
    if squeeze_chars.len() == 3 {
        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
    }

    let member = build_member_set(squeeze_chars);

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    if data.len() <= SINGLE_WRITE_LIMIT {
        let mut outbuf: Vec<u8> = Vec::with_capacity(data.len());
        let mut last_squeezed: u16 = 256;
        let len = data.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            outbuf.set_len(data.len());
            let inp = data.as_ptr();
            let outp: *mut u8 = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
            outbuf.set_len(wp);
        }
        return writer.write_all(&outbuf);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut last_squeezed: u16 = 256;

    for chunk in data.chunks(buf_size) {
        let len = chunk.len();
        let mut wp = 0;
        let mut i = 0;

        unsafe {
            let inp = chunk.as_ptr();
            let outp = outbuf.as_mut_ptr();

            while i < len {
                let b = *inp.add(i);
                if is_member(&member, b) {
                    if last_squeezed != b as u16 {
                        *outp.add(wp) = b;
                        wp += 1;
                        last_squeezed = b as u16;
                    }
                    i += 1;
                    while i < len && *inp.add(i) == b {
                        i += 1;
                    }
                } else {
                    last_squeezed = 256;
                    *outp.add(wp) = b;
                    wp += 1;
                    i += 1;
                }
            }
        }
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
    let len = chunk.len();
    let mut out = Vec::with_capacity(len);
    let mut last_squeezed: u16 = 256;
    let mut i = 0;

    unsafe {
        out.set_len(len);
        let inp = chunk.as_ptr();
        let outp: *mut u8 = out.as_mut_ptr();
        let mut wp = 0;

        while i < len {
            let b = *inp.add(i);
            if is_member(member, b) {
                if last_squeezed != b as u16 {
                    *outp.add(wp) = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                i += 1;
                while i < len && *inp.add(i) == b {
                    i += 1;
                }
            } else {
                last_squeezed = 256;
                *outp.add(wp) = b;
                wp += 1;
                i += 1;
            }
        }
        out.set_len(wp);
    }
    out
}

fn squeeze_multi_mmap<const N: usize>(
    chars: &[u8],
    data: &[u8],
    writer: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let member = build_member_set(chars);
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if is_member(&member, prev_last) {
                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let mut wp = 0;
    let mut last_squeezed: u16 = 256;
    let mut cursor = 0;

    macro_rules! find_next {
        ($data:expr) => {
            if N == 2 {
                memchr::memchr2(chars[0], chars[1], $data)
            } else {
                memchr::memchr3(chars[0], chars[1], chars[2], $data)
            }
        };
    }

    macro_rules! flush_and_copy {
        ($src:expr, $len:expr) => {
            if wp + $len > buf_size {
                writer.write_all(&outbuf[..wp])?;
                wp = 0;
            }
            if $len > buf_size {
                writer.write_all($src)?;
            } else {
                outbuf[wp..wp + $len].copy_from_slice($src);
                wp += $len;
            }
        };
    }

    while cursor < data.len() {
        match find_next!(&data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let b = data[pos];
                if pos > cursor {
                    let span = pos - cursor;
                    flush_and_copy!(&data[cursor..pos], span);
                    last_squeezed = 256;
                }
                if last_squeezed != b as u16 {
                    if wp >= buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    outbuf[wp] = b;
                    wp += 1;
                    last_squeezed = b as u16;
                }
                let mut skip = pos + 1;
                while skip < data.len() && data[skip] == b {
                    skip += 1;
                }
                cursor = skip;
            }
            None => {
                let remaining = data.len() - cursor;
                flush_and_copy!(&data[cursor..], remaining);
                break;
            }
        }
    }
    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}

fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if memchr::memmem::find(data, &[ch, ch]).is_none() {
        return writer.write_all(data);
    }

    if data.len() >= PARALLEL_THRESHOLD {
        let n_threads = rayon::current_num_threads().max(1);
        let chunk_size = (data.len() / n_threads).max(32 * 1024);

        let results: Vec<Vec<u8>> = data
            .par_chunks(chunk_size)
            .map(|chunk| {
                let mut out = Vec::with_capacity(chunk.len());
                let mut cursor = 0;
                while cursor < chunk.len() {
                    match memchr::memchr(ch, &chunk[cursor..]) {
                        Some(offset) => {
                            let pos = cursor + offset;
                            if pos > cursor {
                                out.extend_from_slice(&chunk[cursor..pos]);
                            }
                            out.push(ch);
                            cursor = pos + 1;
                            while cursor < chunk.len() && chunk[cursor] == ch {
                                cursor += 1;
                            }
                        }
                        None => {
                            out.extend_from_slice(&chunk[cursor..]);
                            break;
                        }
                    }
                }
                out
            })
            .collect();

        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
        for (idx, result) in results.iter().enumerate() {
            if result.is_empty() {
                continue;
            }
            if idx > 0 {
                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
                    if prev_last == ch {
                        let skip = result.iter().take_while(|&&b| b == ch).count();
                        if skip < result.len() {
                            slices.push(std::io::IoSlice::new(&result[skip..]));
                        }
                        continue;
                    }
                }
            }
            slices.push(std::io::IoSlice::new(result));
        }
        return write_ioslices(writer, &slices);
    }

    let buf_size = data.len().min(BUF_SIZE);
    let mut outbuf = vec![0u8; buf_size];
    let len = data.len();
    let mut wp = 0;
    let mut cursor = 0;

    while cursor < len {
        match memchr::memchr(ch, &data[cursor..]) {
            Some(offset) => {
                let pos = cursor + offset;
                let gap = pos - cursor;
                if gap > 0 {
                    if wp + gap > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if gap > buf_size {
                        writer.write_all(&data[cursor..pos])?;
                    } else {
                        outbuf[wp..wp + gap].copy_from_slice(&data[cursor..pos]);
                        wp += gap;
                    }
                }
                if wp >= buf_size {
                    writer.write_all(&outbuf[..wp])?;
                    wp = 0;
                }
                outbuf[wp] = ch;
                wp += 1;
                cursor = pos + 1;
                while cursor < len && data[cursor] == ch {
                    cursor += 1;
                }
            }
            None => {
                let remaining = len - cursor;
                if remaining > 0 {
                    if wp + remaining > buf_size {
                        writer.write_all(&outbuf[..wp])?;
                        wp = 0;
                    }
                    if remaining > buf_size {
                        writer.write_all(&data[cursor..])?;
                    } else {
                        outbuf[wp..wp + remaining].copy_from_slice(&data[cursor..]);
                        wp += remaining;
                    }
                }
                break;
            }
        }
    }

    if wp > 0 {
        writer.write_all(&outbuf[..wp])?;
    }
    Ok(())
}
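
// Illustrative smoke tests for the public streaming entry points. These are
// a sketch added alongside the original code, not part of it; the expected
// outputs follow the POSIX `tr` semantics implemented above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn translate_maps_bytes() {
        // `tr el ip` turns "hello" into "hippo".
        let mut input: &[u8] = b"hello";
        let mut out = Vec::new();
        translate(b"el", b"ip", &mut input, &mut out).unwrap();
        assert_eq!(out, b"hippo");
    }

    #[test]
    fn delete_removes_set_members() {
        // `tr -d a` removes every 'a'.
        let mut input: &[u8] = b"banana";
        let mut out = Vec::new();
        delete(b"a", &mut input, &mut out).unwrap();
        assert_eq!(out, b"bnn");
    }

    #[test]
    fn squeeze_collapses_runs() {
        // `tr -s ab` collapses runs of 'a' and 'b' but leaves 'c' alone.
        let mut input: &[u8] = b"aaabbbccc";
        let mut out = Vec::new();
        squeeze(b"ab", &mut input, &mut out).unwrap();
        assert_eq!(out, b"abccc");
    }
}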