1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
use std::borrow::Cow;
use std::ffi::CStr;
use std::str;

/// A utility for converting from outside-world POSIX-oriented strings, such
/// as command-line arguments and environment variables, into UTF-8 strings,
/// using the ARF encoding technique to handle unencodable byte sequences.
///
/// Input that is already valid UTF-8 is kept as-is. Any other input is
/// rewritten into the following layout:
///
/// 1. U+FEFF,
/// 2. the input with every invalid byte replaced by U+FFFD,
/// 3. a NUL character,
/// 4. the input again, with every invalid byte replaced by a NUL character
///    followed by the byte with its high bit cleared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WasiString<'str>(Cow<'str, str>);

impl<'str> WasiString<'str> {
    /// Construct a `WasiString` from the given `&CStr`, using ARF encoding as
    /// needed to ensure that the result is valid UTF-8.
    ///
    /// When the bytes of `cstr` (without the trailing NUL) are already valid
    /// UTF-8 they are borrowed without copying; otherwise an owned,
    /// ARF-encoded copy is built.
    pub fn from_maybe_nonutf8_cstr(cstr: &'str CStr) -> Self {
        let bytes = cstr.to_bytes();
        match str::from_utf8(bytes) {
            Ok(s) => Self(Cow::Borrowed(s)),
            Err(_) => Self::from_nonutf8_cstr(bytes),
        }
    }

    /// Slow path for `from_maybe_nonutf8_cstr`: builds the owned ARF encoding
    /// of `bytes`.
    #[cold]
    fn from_nonutf8_cstr(bytes: &[u8]) -> Self {
        // Lower bound on the output size: U+FEFF (3 bytes), the lossy copy,
        // the NUL separator, and the escaped copy. Each invalid byte adds a
        // little more, which the `String` absorbs by growing.
        let mut data = String::with_capacity(bytes.len() * 2 + 4);

        // Leading marker.
        data.push('\u{feff}');

        // Lossy portion: each invalid byte becomes U+FFFD.
        Self::push_escaping_invalid(&mut data, bytes, |out, _| out.push('\u{FFFD}'));

        // Separator between the lossy and the escaped portions.
        data.push('\0');

        // Escaped portion: each invalid byte becomes NUL followed by the byte
        // with its high bit cleared, which is always an ASCII character.
        Self::push_escaping_invalid(&mut data, bytes, |out, byte| {
            out.push('\0');
            out.push(char::from(byte & 0x7f));
        });

        Self(Cow::Owned(data))
    }

    /// Append `input` to `out`, copying valid UTF-8 runs verbatim and calling
    /// `escape` once for every byte that is not part of a valid sequence.
    fn push_escaping_invalid(
        out: &mut String,
        mut input: &[u8],
        mut escape: impl FnMut(&mut String, u8),
    ) {
        loop {
            match str::from_utf8(input) {
                Ok(valid) => {
                    out.push_str(valid);
                    return;
                }
                Err(error) => {
                    let (valid, after_valid) = input.split_at(error.valid_up_to());

                    // SAFETY: `Utf8Error::valid_up_to` is the length of the
                    // longest prefix of `input` that is valid UTF-8, and
                    // `valid` is exactly that prefix.
                    out.push_str(unsafe { str::from_utf8_unchecked(valid) });

                    // Consume a single offending byte and rescan from the
                    // next one, so every invalid byte is escaped on its own.
                    match after_valid.split_first() {
                        Some((&byte, rest)) => {
                            escape(&mut *out, byte);
                            input = rest;
                        }
                        // A `Utf8Error` always leaves at least one byte after
                        // the valid prefix, so this arm is only a safeguard.
                        None => return,
                    }
                }
            }
        }
    }

    /// Return a reference to the valid UTF-8 contents.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

/// Input that is already valid UTF-8 must come back unchanged.
#[test]
fn valid_utf8() {
    // Runs a NUL-terminated byte string through the public constructor and
    // returns the resulting text.
    fn converted(raw: &[u8]) -> String {
        let cstr = CStr::from_bytes_with_nul(raw).unwrap();
        let wasi = WasiString::from_maybe_nonutf8_cstr(cstr);
        wasi.as_str().to_owned()
    }

    // Empty string.
    assert_eq!(converted(b"\0"), "");
    // Plain ASCII.
    assert_eq!(converted(b"foo\0"), "foo");
}

/// Input containing bytes that are not valid UTF-8 must come back
/// ARF-encoded, while a lone UTF-8 encoded U+FEFF passes through as-is.
#[test]
fn not_utf8() {
    // Runs a NUL-terminated byte string through the public constructor and
    // returns the resulting text.
    fn converted(raw: &[u8]) -> String {
        let cstr = CStr::from_bytes_with_nul(raw).unwrap();
        let wasi = WasiString::from_maybe_nonutf8_cstr(cstr);
        wasi.as_str().to_owned()
    }

    // A single invalid byte.
    assert_eq!(converted(b"\xfe\0"), "\u{feff}\u{fffd}\0\0\u{7e}");

    // Two consecutive invalid bytes are escaped one by one.
    assert_eq!(
        converted(b"\xc0\xff\0"),
        "\u{feff}\u{fffd}\u{fffd}\0\0\u{40}\0\u{7f}"
    );

    // The UTF-8 encoding of U+FEFF on its own is valid and stays untouched.
    assert_eq!(converted(b"\xef\xbb\xbf\0"), "\u{feff}");

    // U+FEFF followed by an invalid byte.
    assert_eq!(
        converted(b"\xef\xbb\xbf\xfd\0"),
        "\u{feff}\u{feff}\u{fffd}\0\u{feff}\0\x7d"
    );
}