util/
utf8.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8use std::fmt::Write;
9
/// Converts `input` to a `String`, copying every valid UTF-8 sequence through
/// unchanged and replacing each byte that is not part of valid UTF-8 with an
/// escaped uppercase hex byte (e.g. `\xC3`).
///
/// An incomplete multi-byte sequence at the end of `input` is escaped byte by
/// byte as well. Valid text is never escaped, so a literal backslash in the
/// input is copied as-is and the result is meant for display, not for
/// round-tripping back to the original bytes.
pub fn escape_non_utf8(mut input: &[u8]) -> String {
    // Every input byte produces at least one output byte, so the input length
    // is a lower bound for the output; escapes may still grow the buffer.
    let mut output = String::with_capacity(input.len());

    while !input.is_empty() {
        let err = match std::str::from_utf8(input) {
            // Everything that is left is valid: copy it and stop.
            Ok(valid) => {
                output.push_str(valid);
                break;
            }
            Err(err) => err,
        };

        // `input` starts with `valid_up_to()` bytes of valid UTF-8, followed
        // by an invalid sequence, followed by bytes that still need checking.
        let (valid, rest) = input.split_at(err.valid_up_to());
        // SAFETY: `Utf8Error::valid_up_to` is the length of the longest prefix
        // of `input` that `from_utf8` verified to be valid UTF-8, and `valid`
        // is exactly that prefix.
        output.push_str(unsafe { std::str::from_utf8_unchecked(valid) });

        // `error_len()` is `None` when the input ends in the middle of a
        // multi-byte sequence; in that case all remaining bytes are invalid.
        let invalid_len = err.error_len().unwrap_or(rest.len());
        let (invalid, remaining) = rest.split_at(invalid_len);
        for byte in invalid {
            // Bytes inside an invalid sequence are always >= 0x80, so this is
            // always two hex digits; the width just makes that explicit.
            write!(output, r"\x{:02X}", byte).expect("writing to a String cannot fail");
        }
        input = remaining;
    }

    output
}
38
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid input passes through unchanged and a single stray lead byte is
    /// escaped wherever it appears relative to ASCII or multi-byte text.
    #[test]
    fn test_escape_non_utf8() {
        assert_eq!(escape_non_utf8(b""), "");
        assert_eq!(escape_non_utf8(b"hello"), "hello");

        assert_eq!(escape_non_utf8(b"\xc3"), r"\xC3");
        assert_eq!(escape_non_utf8(b"\xc3A"), r"\xC3A");
        assert_eq!(escape_non_utf8(b"A\xc3"), r"A\xC3");

        let nihao = "你好".as_bytes();
        assert_eq!(escape_non_utf8(nihao), "你好");
        assert_eq!(escape_non_utf8(&[b"\xc3", nihao].concat()), r"\xC3你好");
        assert_eq!(escape_non_utf8(&[nihao, b"\xc3"].concat()), r"你好\xC3");
        assert_eq!(
            escape_non_utf8(&[nihao, b"\xc3", nihao].concat()),
            r"你好\xC3你好"
        );
    }

    /// Runs of invalid bytes are escaped one byte at a time, including bytes
    /// that can never appear in UTF-8 and stray continuation bytes.
    #[test]
    fn test_escape_consecutive_invalid_bytes() {
        assert_eq!(escape_non_utf8(b"\xff\xfe"), r"\xFF\xFE");
        assert_eq!(escape_non_utf8(b"a\x80b"), r"a\x80b");
        assert_eq!(escape_non_utf8(b"\x80\x80\x80"), r"\x80\x80\x80");
    }

    /// A multi-byte sequence that is cut short, either by the end of the input
    /// or by a following non-continuation byte, has every one of its bytes
    /// escaped.
    #[test]
    fn test_escape_truncated_multibyte_sequence() {
        // First two bytes of a three-byte character, at end of input.
        assert_eq!(escape_non_utf8(b"\xe4\xbd"), r"\xE4\xBD");
        // Same prefix, interrupted by an ASCII byte.
        assert_eq!(escape_non_utf8(b"\xe4\xbdA"), r"\xE4\xBDA");
        // First three bytes of a four-byte character, at end of input.
        assert_eq!(escape_non_utf8(b"\xf0\x9f\x98"), r"\xF0\x9F\x98");
    }

    /// Valid text is never escaped, so a literal backslash is copied as-is.
    #[test]
    fn test_escape_keeps_literal_backslash() {
        assert_eq!(escape_non_utf8(br"a\b"), r"a\b");
        assert_eq!(escape_non_utf8(br"\xC3"), r"\xC3");
    }
}