1pub mod error;
2pub mod log;
3
4#[cfg(feature = "verbose-log")]
5use std::sync::atomic::{AtomicUsize, Ordering};
6use std::{
7 fs,
8 io::{self, BufWriter, Write},
9 path::Path,
10};
11
12pub use error::*;
13use memchr::{memchr, memmem::Finder};
14use snafu::ResultExt;
15use tempfile::NamedTempFile;
16
17#[cfg(feature = "verbose-log")]
18use crate::log::VerboseLogger;
19use crate::log::{DecodeLogger, NoOpLogger};
20
/// Files shorter than this many bytes (256 KiB) are read fully into memory;
/// larger files are memory-mapped instead.
const SMALL_FILE_THRESHOLD: u64 = 256 * 1024;
/// Capacity in bytes (64 KiB) of the `BufWriter` used when streaming decoded
/// output into a temporary file.
const IO_BUF_SIZE: usize = 64 * 1024;
/// 256-bit membership table of the bytes that may appear inside a URL:
/// ASCII alphanumerics plus the punctuation listed here.
const URL_CHAR_BITMAP: [u32; 8] = gen_url_bitmap(b"-+&@#/%?=~_|!:,.;");
/// 256-bit membership table of the bytes a URL may end with. Trailing bytes
/// outside this set are split off a matched URL and left undecoded.
const URL_END_CHAR_BITMAP: [u32; 8] = gen_url_bitmap(b"-+&@#/%=~_|");
/// Maps every byte to its hexadecimal digit value, or [`HEX_INVALID`] when the
/// byte is not an ASCII hex digit.
const HEX_MAP: [u8; 256] = gen_hex_map();
/// Sentinel stored in [`HEX_MAP`] for bytes that are not hex digits.
const HEX_INVALID: u8 = 0xFF;
27
/// Builds a 256-bit membership table (eight `u32` words) at compile time.
///
/// The bit for byte `b` lives in word `b / 32` at position `b % 32`. The bits
/// for all ASCII digits and letters are always set; every byte listed in
/// `symbols` is set in addition.
const fn gen_url_bitmap(symbols: &[u8]) -> [u32; 8] {
    /// Returns `table` with the bit belonging to `byte` switched on.
    const fn with_bit(mut table: [u32; 8], byte: u8) -> [u32; 8] {
        let pos = byte as usize;
        table[pos / 32] |= 1u32 << (pos % 32);
        table
    }

    // Inclusive byte ranges that are always part of the table.
    const ALNUM_RANGES: [(u8, u8); 3] = [(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')];

    let mut table = [0u32; 8];

    // `for` loops are not available in `const fn`, hence the manual counters.
    let mut range_idx = 0;
    while range_idx < ALNUM_RANGES.len() {
        let (first, last) = ALNUM_RANGES[range_idx];
        let mut byte = first;
        while byte <= last {
            table = with_bit(table, byte);
            byte += 1;
        }
        range_idx += 1;
    }

    let mut sym_idx = 0;
    while sym_idx < symbols.len() {
        table = with_bit(table, symbols[sym_idx]);
        sym_idx += 1;
    }

    table
}
56
57const fn gen_hex_map() -> [u8; 256] {
58 let mut map = [HEX_INVALID; 256];
59 let mut i = 0;
60 while i < 10 {
61 map[(b'0' + i) as usize] = i;
62 i += 1;
63 }
64 let mut i = 0;
65 while i < 6 {
66 map[(b'a' + i) as usize] = 10 + i;
67 map[(b'A' + i) as usize] = 10 + i;
68 i += 1;
69 }
70 map
71}
72
/// Empty function marked `#[cold]`; calling it on a branch marks that branch
/// as the unlikely one.
#[inline]
#[cold]
fn cold() {}

/// Returns `b` unchanged while routing the `false` case through [`cold`], so
/// the `true` case is treated as the expected path.
#[inline]
fn likely(b: bool) -> bool {
    if b {
        return true;
    }
    cold();
    false
}
84
85#[inline(always)]
86fn decode_hex_pair(h1: u8, h2: u8) -> Option<u8> {
87 let v1 = unsafe { *HEX_MAP.get_unchecked(h1 as usize) };
88 let v2 = unsafe { *HEX_MAP.get_unchecked(h2 as usize) };
89 if likely((v1 | v2) != HEX_INVALID) {
90 Some((v1 << 4) | v2)
91 } else {
92 None
93 }
94}
95
96#[inline(always)]
97fn is_url_char(byte: u8) -> bool {
98 let idx = byte as usize;
99 unsafe { (URL_CHAR_BITMAP.get_unchecked(idx >> 5) >> (idx & 31)) & 1 == 1 }
100}
101
102#[inline(always)]
103fn is_url_end_char(byte: u8) -> bool {
104 let idx = byte as usize;
105 unsafe { (URL_END_CHAR_BITMAP.get_unchecked(idx >> 5) >> (idx & 31)) & 1 == 1 }
106}
107
108#[inline(always)]
109fn trim_url_end(slice: &[u8]) -> (&[u8], &[u8]) {
110 let mut end = slice.len();
111 while end > 0 {
112 if is_url_end_char(unsafe { *slice.get_unchecked(end - 1) }) {
113 break;
114 }
115 end -= 1;
116 }
117 unsafe { (slice.get_unchecked(..end), slice.get_unchecked(end..)) }
118}
119
// Calls `$func` with the given arguments, supplying a logger when the
// `verbose-log` feature is enabled.
//
// With `verbose-log`: `&mut logger` is appended as the last argument, where
// `logger` is a fresh `VerboseLogger` if `$verbose` is true and a `NoOpLogger`
// otherwise. Each branch calls `$func` with its own concrete logger type.
//
// Without `verbose-log`: `$func` is called with the given arguments only;
// `$verbose` is still evaluated but both branches do the same thing.
macro_rules! decode {
    ($func:ident($($args:expr),*), $verbose:expr) => {{
        if $verbose {
            #[cfg(feature = "verbose-log")]
            {
                let mut logger = VerboseLogger::new();
                $func($($args),*, &mut logger)
            }
            #[cfg(not(feature = "verbose-log"))]
            {
                $func($($args),*)
            }
        } else {
            #[cfg(feature = "verbose-log")]
            {
                let mut logger = NoOpLogger;
                $func($($args),*, &mut logger)
            }
            #[cfg(not(feature = "verbose-log"))]
            {
                $func($($args),*)
            }
        }
    }};
}
150
151pub fn decode_in_place(
156 data: &mut [u8],
157 escape_space: bool,
158 #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
159) -> usize {
160 if escape_space {
161 decode_in_place_inner::<true>(
162 data,
163 #[cfg(feature = "verbose-log")]
164 logger,
165 )
166 } else {
167 decode_in_place_inner::<false>(
168 data,
169 #[cfg(feature = "verbose-log")]
170 logger,
171 )
172 }
173}
174
175#[inline(always)]
176fn decode_in_place_inner<const ESCAPE_SPACE: bool>(
177 data: &mut [u8],
178 #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
179) -> usize {
180 let mut r = 0;
181 let mut w = 0;
182 let len = data.len();
183 let finder = Finder::new(b"http");
184
185 while r < len {
186 if let Some(match_idx) = finder.find(&data[r..]) {
187 let start = r + match_idx;
188
189 let is_http = data[start..].starts_with(b"http://");
190 let is_https = data[start..].starts_with(b"https://");
191
192 if is_http || is_https {
193 if start > r {
195 let copy_len = start - r;
196 if w != r {
197 data.copy_within(r..start, w);
198 }
199 w += copy_len;
200 }
201
202 let prefix_len = if is_https { 8 } else { 7 };
204 let mut end = start + prefix_len;
205 while end < len && is_url_char(data[end]) {
206 end += 1;
207 }
208
209 let mut valid_end = end;
210 while valid_end > start {
211 if is_url_end_char(unsafe { *data.get_unchecked(valid_end - 1) }) {
212 break;
213 }
214 valid_end -= 1;
215 }
216
217 w = decode_url_in_place_indices::<ESCAPE_SPACE>(
219 data,
220 start,
221 valid_end,
222 w,
223 #[cfg(feature = "verbose-log")]
224 logger,
225 );
226
227 let suffix_len = end - valid_end;
229 if suffix_len > 0 {
230 if w != valid_end {
231 data.copy_within(valid_end..end, w);
232 }
233 w += suffix_len;
234 }
235
236 r = end;
237 } else {
238 let copy_len = start + 4 - r;
240 if w != r {
241 data.copy_within(r..start + 4, w);
242 }
243 w += copy_len;
244 r = start + 4;
245 }
246 } else {
247 if r < len {
249 let copy_len = len - r;
250 if w != r {
251 data.copy_within(r..len, w);
252 }
253 w += copy_len;
254 }
255 break;
256 }
257 }
258 w
259}
260
/// Percent-decodes the URL stored in `data[src_start..src_end]`, writing the
/// result into `data` starting at index `dst`, and returns the index one past
/// the last byte written.
///
/// Valid `%XX` escapes are replaced by the byte they encode; when
/// `ESCAPE_SPACE` is true, escapes that decode to a space are kept as they
/// are. Invalid or truncated escapes are copied through unchanged.
///
/// Source and destination share one buffer.
/// NOTE(review): this assumes `dst <= src_start`, so output never overtakes
/// unread input — the visible caller maintains this; confirm for new callers.
#[inline(always)]
fn decode_url_in_place_indices<const ESCAPE_SPACE: bool>(
    data: &mut [u8],
    src_start: usize,
    src_end: usize,
    mut dst: usize,
    #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
) -> usize {
    // Without the feature there is no logger parameter; use a local no-op one
    // so the body below is identical in both configurations.
    #[cfg(not(feature = "verbose-log"))]
    let mut logger = NoOpLogger;
    logger.clear();

    let mut i = src_start;
    let first_pct = match memchr(b'%', &data[i..src_end]) {
        Some(idx) => idx,
        None => {
            // No escape at all: move the URL as one piece and finish early.
            let len = src_end - i;
            logger.log_orig_slice(&data[i..src_end]);
            logger.log_res_slice(&data[i..src_end]);
            if dst != i {
                data.copy_within(i..src_end, dst);
            }
            return dst + len;
        }
    };

    // Move everything before the first '%' in one copy.
    if first_pct > 0 {
        logger.log_orig_slice(&data[i..i + first_pct]);
        logger.log_res_slice(&data[i..i + first_pct]);
        if dst != i {
            data.copy_within(i..i + first_pct, dst);
        }
        dst += first_pct;
        i += first_pct;
    }

    // Start of the run of bytes that has been scanned but not yet copied.
    let mut literal_start = i;
    let mut changed = false;

    while i < src_end {
        // A full escape needs two more bytes inside the URL.
        if data[i] == b'%' && i + 2 < src_end {
            let h1 = data[i + 1];
            let h2 = data[i + 2];
            if let Some(decoded) = decode_hex_pair(h1, h2) {
                if ESCAPE_SPACE && decoded == b' ' {
                    // Keep "%20" encoded: it stays part of the pending run.
                    i += 3;
                    continue;
                }

                changed = true;
                // Flush the pending literal run before emitting the byte.
                if i > literal_start {
                    let len = i - literal_start;
                    logger.log_orig_slice(&data[literal_start..i]);
                    logger.log_res_slice(&data[literal_start..i]);
                    if dst != literal_start {
                        data.copy_within(literal_start..i, dst);
                    }
                    dst += len;
                }

                logger.log_orig(b'%');
                logger.log_orig(h1);
                logger.log_orig(h2);
                logger.log_res(decoded);

                // `h1`/`h2` were read above, so overwriting here is safe even
                // when `dst` points into the escape itself.
                data[dst] = decoded;
                dst += 1;
                i += 3;
                literal_start = i;
                continue;
            } else {
                // Not a valid escape: treat the '%' as a literal byte.
                i += 1;
                continue;
            }
        }
        if data[i] == b'%' {
            // '%' too close to the end to form an escape.
            i += 1;
        } else {
            // Jump straight to the next '%' (or to the end of the URL).
            match memchr(b'%', &data[i..src_end]) {
                Some(offset) => i += offset,
                None => i = src_end,
            }
        }
    }

    // Flush whatever literal bytes remain after the last decoded escape.
    if literal_start < src_end {
        let len = src_end - literal_start;
        logger.log_orig_slice(&data[literal_start..src_end]);
        logger.log_res_slice(&data[literal_start..src_end]);
        if dst != literal_start {
            data.copy_within(literal_start..src_end, dst);
        }
        dst += len;
    }

    logger.print_if_changed(changed);
    dst
}
359
360#[cfg(not(feature = "safe"))]
361fn decode_file_in_place(
362 path: &Path,
363 escape_space: bool,
364 #[allow(unused)] verbose: bool,
365 #[cfg(feature = "verbose-log")] p_counter: &AtomicUsize,
366 #[cfg(feature = "verbose-log")] c_counter: &AtomicUsize,
367) -> Result<()> {
368 use std::fs::{self, OpenOptions};
369
370 let metadata = fs::metadata(path).context(ReadInputSnafu)?;
371 let file_len = metadata.len();
372
373 if file_len == 0 {
374 #[cfg(feature = "verbose-log")]
375 p_counter.fetch_add(1, Ordering::Relaxed);
376 return Ok(());
377 }
378
379 #[allow(unused)]
380 let changed = if file_len < SMALL_FILE_THRESHOLD {
381 let mut buf = fs::read(path).context(ReadInputSnafu)?;
382 let new_len = decode!(decode_in_place(&mut buf, escape_space), verbose);
383 let is_changed = new_len < file_len as usize;
384
385 if is_changed {
386 fs::write(path, &buf[..new_len]).context(WriteOutputSnafu)?;
387 }
388 is_changed
389 } else {
390 let file = OpenOptions::new()
391 .read(true)
392 .write(true)
393 .open(path)
394 .context(OpenInputSnafu { path })?;
395
396 let mut mmap = unsafe {
397 memmap2::MmapOptions::new()
398 .map_mut(&file)
399 .context(ReadInputSnafu)?
400 };
401
402 #[cfg(unix)]
403 mmap.advise(memmap2::Advice::Sequential);
404
405 let new_len = decode!(decode_in_place(&mut mmap, escape_space), verbose);
406 let is_changed = new_len < file_len as usize;
407
408 if is_changed {
409 mmap.flush().context(WriteOutputSnafu)?;
410 drop(mmap);
411 file.set_len(new_len as u64).context(WriteOutputSnafu)?;
412 }
413 is_changed
414 };
415
416 #[cfg(feature = "verbose-log")]
417 {
418 p_counter.fetch_add(1, Ordering::Relaxed);
419 if changed {
420 c_counter.fetch_add(1, Ordering::Relaxed);
421 if verbose {
422 println!("Processed File: {:?}", path);
423 }
424 }
425 }
426
427 Ok(())
428}
429
430pub fn decode_slice_to_writer<W: Write>(
433 input: &[u8],
434 writer: &mut W,
435 escape_space: bool,
436 #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
437) -> io::Result<bool> {
438 let mut pos = 0;
439 let len = input.len();
440 let mut changed = false;
441 let finder = Finder::new(b"http");
442
443 while pos < len {
444 if let Some(match_idx) = finder.find(&input[pos..]) {
445 let start = pos + match_idx;
446
447 let is_http = input[start..].starts_with(b"http://");
448 let is_https = input[start..].starts_with(b"https://");
449
450 if is_http || is_https {
451 if start > pos {
453 writer.write_all(&input[pos..start])?;
454 }
455
456 let prefix_len = if is_https { 8 } else { 7 };
458 let mut end = start + prefix_len;
459 while end < len && is_url_char(input[end]) {
460 end += 1;
461 }
462
463 let raw_url = &input[start..end];
464 let (valid_url, suffix) = trim_url_end(raw_url);
465
466 #[cfg(feature = "verbose-log")]
468 let url_changed = decode_url_to_writer(valid_url, writer, escape_space, logger)?;
469 #[cfg(not(feature = "verbose-log"))]
470 let url_changed = decode_url_to_writer(valid_url, writer, escape_space)?;
471 if url_changed {
472 changed = true;
473 }
474
475 if !suffix.is_empty() {
477 writer.write_all(suffix)?;
478 }
479
480 pos = end;
481 } else {
482 writer.write_all(&input[pos..start + 4])?;
484 pos = start + 4;
485 }
486 } else {
487 if pos < len {
489 writer.write_all(&input[pos..])?;
490 }
491 break;
492 }
493 }
494 Ok(changed)
495}
496
497#[inline(always)]
498pub fn decode_url_to_writer<W: Write>(
499 url: &[u8],
500 writer: &mut W,
501 escape_space: bool,
502 #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
503) -> io::Result<bool> {
504 if escape_space {
506 decode_inner::<true, W>(
507 url,
508 writer,
509 #[cfg(feature = "verbose-log")]
510 logger,
511 )
512 } else {
513 decode_inner::<false, W>(
514 url,
515 writer,
516 #[cfg(feature = "verbose-log")]
517 logger,
518 )
519 }
520}
521
522#[inline(always)]
523fn decode_inner<const ESCAPE_SPACE: bool, W: Write>(
524 url: &[u8],
525 writer: &mut W,
526 #[cfg(feature = "verbose-log")] logger: &mut impl DecodeLogger,
527) -> io::Result<bool> {
528 #[cfg(not(feature = "verbose-log"))]
529 let mut logger = NoOpLogger;
530 logger.clear();
531
532 let first_pct = match memchr(b'%', url) {
533 Some(idx) => idx,
534 None => {
535 writer.write_all(url)?;
536 logger.log_orig_slice(url);
537 logger.log_res_slice(url);
538 return Ok(false);
539 }
540 };
541
542 if first_pct > 0 {
543 writer.write_all(&url[..first_pct])?;
544 logger.log_orig_slice(&url[..first_pct]);
545 logger.log_res_slice(&url[..first_pct]);
546 }
547
548 let mut i = first_pct;
549 let len = url.len();
550 let mut changed = false;
551 let mut literal_start = i; while i < len {
554 if url[i] == b'%' && i + 2 < len {
555 let h1 = url[i + 1];
556 let h2 = url[i + 2];
557 if let Some(decoded) = decode_hex_pair(h1, h2) {
558 if ESCAPE_SPACE && decoded == b' ' {
559 i += 3;
560 continue;
561 }
562
563 changed = true;
564 if i > literal_start {
565 writer.write_all(&url[literal_start..i])?;
566 logger.log_orig_slice(&url[literal_start..i]);
567 logger.log_res_slice(&url[literal_start..i]);
568 }
569 writer.write_all(&[decoded])?;
570 logger.log_orig(b'%');
571 logger.log_orig(h1);
572 logger.log_orig(h2);
573 logger.log_res(decoded);
574
575 i += 3;
576 literal_start = i;
577 continue;
578 } else {
579 i += 1;
580 continue;
581 }
582 }
583 if url[i] == b'%' {
584 i += 1;
585 } else {
586 match memchr(b'%', &url[i..]) {
587 Some(offset) => i += offset,
588 None => i = len,
589 }
590 }
591 }
592 if literal_start < len {
593 writer.write_all(&url[literal_start..len])?;
594 logger.log_orig_slice(&url[literal_start..len]);
595 logger.log_res_slice(&url[literal_start..len]);
596 }
597
598 logger.print_if_changed(changed);
599 Ok(changed)
600}
601
602pub fn decode_str(
604 input: &str,
605 escape_space: bool,
606 #[cfg(feature = "verbose-log")] verbose: bool,
607) -> Result<(String, bool)> {
608 #[cfg(not(feature = "verbose-log"))]
609 let verbose = false;
610 let mut buf = Vec::with_capacity(input.len());
611
612 let changed = decode!(
613 decode_slice_to_writer(input.as_bytes(), &mut buf, escape_space),
614 verbose
615 )
616 .context(WriteOutputSnafu)?;
617
618 Ok((
619 simdutf8::basic::from_utf8(&buf)
620 .context(InvalidUtf8Snafu)?
621 .to_owned(),
622 changed,
623 ))
624}
625
626pub fn decode_file(
628 path: impl AsRef<Path>,
629 escape_space: bool,
630 dry_run: bool,
631 #[cfg(feature = "verbose-log")] verbose: bool,
632 #[cfg(feature = "verbose-log")] p_counter: &AtomicUsize,
633 #[cfg(feature = "verbose-log")] c_counter: &AtomicUsize,
634) -> Result<()> {
635 #[cfg(not(feature = "verbose-log"))]
636 let verbose = false;
637
638 let path = path.as_ref();
639
640 #[cfg(not(feature = "safe"))]
641 {
642 if !dry_run {
643 return decode_file_in_place(
644 path,
645 escape_space,
646 verbose,
647 #[cfg(feature = "verbose-log")]
648 p_counter,
649 #[cfg(feature = "verbose-log")]
650 c_counter,
651 );
652 }
653 }
654
655 let metadata = fs::metadata(path).context(ReadInputSnafu)?;
657 let file_len = metadata.len();
658
659 if file_len == 0 {
660 #[cfg(feature = "verbose-log")]
661 p_counter.fetch_add(1, Ordering::Relaxed);
662 return Ok(());
663 }
664
665 #[allow(unused)]
666 let mut changed = false;
667
668 #[allow(unused)]
669 if file_len < SMALL_FILE_THRESHOLD {
670 let mut buf = fs::read(path).context(ReadInputSnafu)?;
671 let new_len = decode!(decode_in_place(&mut buf, escape_space), verbose);
672 changed = new_len < buf.len();
673
674 if changed && !dry_run {
675 buf.truncate(new_len);
676 let parent = path.parent().unwrap_or_else(|| Path::new("."));
677
678 let mut temp_file =
679 NamedTempFile::new_in(parent).context(CreateTempSnafu { dir: parent })?;
680
681 temp_file.write_all(&buf).context(WriteOutputSnafu)?;
682 temp_file.flush().context(WriteOutputSnafu)?;
683
684 let _ = temp_file.as_file().set_permissions(metadata.permissions());
686 temp_file.persist(path).context(PersistTempSnafu { path })?;
687 }
688 } else {
689 let file = fs::File::open(path).context(OpenInputSnafu { path })?;
691 let mmap = unsafe {
692 memmap2::MmapOptions::new()
693 .map(&file)
694 .context(ReadInputSnafu)?
695 };
696
697 #[cfg(unix)]
698 mmap.advise(memmap2::Advice::Sequential);
699
700 if dry_run {
701 let mut sink = io::sink();
702 changed = decode!(
703 decode_slice_to_writer(&mmap, &mut sink, escape_space),
704 verbose
705 )
706 .context(WriteOutputSnafu)?;
707 } else {
708 let parent = path.parent().unwrap_or_else(|| Path::new("."));
709
710 let mut temp_file =
711 NamedTempFile::new_in(parent).context(CreateTempSnafu { dir: parent })?;
712
713 {
714 let mut buf_writer = BufWriter::with_capacity(IO_BUF_SIZE, &mut temp_file);
715 changed = decode!(
716 decode_slice_to_writer(&mmap, &mut buf_writer, escape_space),
717 verbose
718 )
719 .context(WriteOutputSnafu)?;
720 buf_writer.flush().context(WriteOutputSnafu)?;
721 }
722
723 drop(mmap);
724 drop(file);
725
726 if changed {
727 let _ = temp_file.as_file().set_permissions(metadata.permissions());
730 temp_file.persist(path).context(PersistTempSnafu { path })?;
731 }
732 }
733 }
734
735 #[cfg(feature = "verbose-log")]
736 {
737 p_counter.fetch_add(1, Ordering::Relaxed);
738 if changed {
739 c_counter.fetch_add(1, Ordering::Relaxed);
740 if verbose {
741 println!("Processed File: {:?}", path);
742 }
743 }
744 }
745
746 Ok(())
747}
748
#[cfg(test)]
mod tests {

    use tempfile::NamedTempFile;

    use super::*;

    /// Calls `decode_str` and unwraps the result, hiding the feature-gated
    /// `verbose` argument from the individual test cases.
    fn decode(input: &str, escape_space: bool, verbose: bool) -> (String, bool) {
        #[cfg(not(feature = "verbose-log"))]
        let _ = verbose;
        decode_str(
            input,
            escape_space,
            #[cfg(feature = "verbose-log")]
            verbose,
        )
        .unwrap()
    }

    #[test]
    fn test_basic() {
        // A bare URL with UTF-8 escapes.
        assert_eq!(
            decode(
                "https://www.baidu.com/s?ie=UTF-8&wd=%E5%A4%A9%E6%B0%94",
                false,
                false
            ),
            (String::from("https://www.baidu.com/s?ie=UTF-8&wd=天气"), true)
        );
        // The same URL wrapped in parentheses.
        assert_eq!(
            decode(
                "(https://www.baidu.com/s?ie=UTF-8&wd=%E5%A4%A9%E6%B0%94)",
                false,
                false
            ),
            (
                String::from("(https://www.baidu.com/s?ie=UTF-8&wd=天气)"),
                true
            )
        );
        // With `escape_space`, "%20" stays encoded while other escapes decode.
        assert_eq!(
            decode(
                "https://osu.ppy.sh/beatmapsets?q=malody%204k%20extra%20dan%20v3%E4%B8%AD",
                true,
                true
            ),
            (
                String::from("https://osu.ppy.sh/beatmapsets?q=malody%204k%20extra%20dan%20v3中"),
                true
            )
        );
        // Nothing to decode.
        assert_eq!(
            decode("https://osu.ppy.sh", true, false),
            (String::from("https://osu.ppy.sh"), false)
        );
    }

    #[test]
    fn test_long_url() {
        let spaces = " ".repeat(10000);
        let decoded_url = format!("https://www.baidu.com/s?ie=UTF-8&wd=天气{}", spaces);

        let mut url = "https://www.baidu.com/s?ie=UTF-8&wd=%E5%A4%A9%E6%B0%94".to_string();
        for _ in 0..10000 {
            url.push_str("%20");
        }
        assert_eq!(decode(&url, false, false), (decoded_url.clone(), true));

        // The same URL preceded by a long run of plain text.
        let base = "a".repeat(60000);
        let prefixed_input = format!("{}{}", base, url);
        let prefixed_expected = format!("{}{}", base, decoded_url);
        assert_eq!(
            decode(&prefixed_input, false, false),
            (prefixed_expected, true)
        );
    }

    #[test]
    fn test_decode_file() {
        let t1 = NamedTempFile::new().unwrap().into_temp_path();
        let test_str = "xxxxhttps://www.baidu.com/s?ie=UTF-8&wd=%E5%A4%A9%E6%B0%94xxxx";
        fs::write(&t1, test_str).unwrap();

        decode_file(
            &t1,
            false,
            false,
            #[cfg(feature = "verbose-log")]
            false,
            #[cfg(feature = "verbose-log")]
            &AtomicUsize::new(0),
            #[cfg(feature = "verbose-log")]
            &AtomicUsize::new(0),
        )
        .unwrap();

        let content = fs::read_to_string(t1).unwrap();
        assert_eq!(content, "xxxxhttps://www.baidu.com/s?ie=UTF-8&wd=天气xxxx");
    }
}