1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
//! `escape8259` performs RFC8259-compliant string escaping and un-escaping.
//!
//! [RFC8259] is a JSON encoding standard.  Many JSON encoders exist, but other
//! RFCs use the same string escaping mechanism, so it's useful to be able to
//! access the string escaping functions by themselves.
//!
//! # Examples
//!
//! ```
//! use escape8259::{escape, unescape};
//!
//! assert_eq!(unescape(r#"\u0041\n"#).unwrap(), "A\n");
//!
//! let multiline = r#"hello
//!  world"#;
//! assert_eq!(escape(multiline), r#"hello\n world"#);
//! ```
//!
//! [RFC8259]: https://tools.ietf.org/html/rfc8259


#![warn(missing_docs)]

use std::char::decode_utf16;

/// An error occurred while un-escaping a string.
///
/// Returned by [`unescape`] when its input cannot be decoded.  The error
/// carries no further detail about what was wrong or where.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct UnescapeError {}

impl std::fmt::Display for UnescapeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("invalid RFC8259 escaped string")
    }
}

// Lets callers use `?` into `Box<dyn Error>` and similar error containers.
impl std::error::Error for UnescapeError {}

// Shorthand for the result type used by every fallible function in this file.
type UnescapeResult<T> = Result<T, UnescapeError>;

// Accumulates un-escaped output and pairs up UTF-16 surrogate halves that
// arrive as separate u16 values.
struct UnescapeState {
    // Text decoded so far.
    out: String,
    // A surrogate code unit that is still waiting for its partner.
    pending: Option<u16>,
}

impl UnescapeState {
    fn new() -> UnescapeState {
        UnescapeState {
            out: String::new(),
            pending: None,
        }
    }

    // Decode the first character from a short run of UTF-16 code units.
    // The std library only offers UTF-16 decoding over an iterator, so the
    // units are passed in as a slice and only the first result is used.
    fn decode_first(units: &[u16]) -> UnescapeResult<char> {
        decode_utf16(units.iter().copied())
            .next()
            .and_then(|decoded| decoded.ok())
            .ok_or(UnescapeError {})
    }

    // Append an already-decoded character.  This is an error while a
    // surrogate is pending, because that surrogate can no longer be paired.
    fn push_char(&mut self, c: char) -> UnescapeResult<()> {
        match self.pending {
            Some(_) => Err(UnescapeError {}),
            None => {
                self.out.push(c);
                Ok(())
            }
        }
    }

    // Append one UTF-16 code unit.  A non-surrogate unit is a complete
    // character; a surrogate is held back until the next unit arrives and
    // the two are decoded together.
    fn push_u16(&mut self, x: u16) -> UnescapeResult<()> {
        let is_surrogate = (0xD800..=0xDFFF).contains(&x);
        match self.pending {
            // Nothing pending: hold the surrogate back for the next call.
            None if is_surrogate => {
                self.pending = Some(x);
                Ok(())
            }
            // Nothing pending and a plain unit: decode it on its own.
            None => {
                let c = Self::decode_first(&[x])?;
                self.out.push(c);
                Ok(())
            }
            // Two surrogates: they must decode to a single character.
            Some(first) if is_surrogate => {
                let c = Self::decode_first(&[first, x])?;
                self.out.push(c);
                self.pending = None;
                Ok(())
            }
            // A pending surrogate followed by a non-surrogate unit.
            Some(_) => Err(UnescapeError {}),
        }
    }

    // Consume the state and hand back the accumulated String, or an error
    // if a surrogate was left without its partner.
    fn finalize(self) -> UnescapeResult<String> {
        match self.pending {
            Some(_) => Err(UnescapeError {}),
            None => Ok(self.out),
        }
    }
}

// Read exactly four hexadecimal digits from `s` and return their value.
//
// Each character is validated on its own: `u16::from_str_radix` would accept
// a leading `+` sign (so "+041" would parse as 0x41), which is not permitted
// by the `\uXXXX` escape syntax.
//
// Returns an error if the input ends early or a character is not one of
// `0-9`, `a-f`, `A-F`.  Reading stops at the first offending character.
fn parse_u16<S>(s: &mut S) -> UnescapeResult<u16>
where
    S: Iterator<Item = char>,
{
    let mut value: u16 = 0;
    for _ in 0..4 {
        let digit = s
            .next()
            .and_then(|c| c.to_digit(16))
            .ok_or(UnescapeError {})?;
        // `to_digit(16)` yields 0..=15, so the cast cannot truncate, and four
        // 4-bit digits exactly fill the u16.
        value = (value << 4) | digit as u16;
    }
    Ok(value)
}

// RFC8259 says non-escaped characters must be in one of the following ranges:
// %x20-21 / %x23-5B / %x5D-10FFFF
//
// Those ranges cover everything from U+0020 up to the largest `char`, with
// only the quotation mark (U+0022) and the backslash (U+005C) left out.
fn is_safe_char(c: char) -> bool {
    let is_control = c < '\u{20}';
    let is_quote = c == '\u{22}';
    let is_backslash = c == '\u{5C}';

    !(is_control || is_quote || is_backslash)
}

/// Un-escape a string, following RFC8259 rules.
///
/// The only allowed single-character escapes are:
/// `\" \\ \/ \b \f \n \r \t`
///
/// Any character may also be escaped in UTF-16 form:
/// `\uXXXX` or `\uXXXX\uXXXX`
///
/// Characters in the ranges `0x20-21`, `0x23-5B`, `0x5D-10FFFF`
/// may appear un-escaped.
///
/// # Errors
///
/// Returns [`UnescapeError`] if the input contains:
/// - an un-escaped character outside the ranges above,
/// - a backslash at the end of the input or followed by an unknown escape
///   character,
/// - a `\u` escape that is not followed by a valid hexadecimal number, or
/// - a UTF-16 surrogate that is not part of a valid `\uXXXX\uXXXX` pair.
pub fn unescape(s: &str) -> UnescapeResult<String> {
    let mut state = UnescapeState::new();
    let mut ins = s.chars();

    // `while let` rather than `for`: the escape handling below pulls further
    // characters from the same iterator.
    while let Some(c) = ins.next() {
        if c != '\\' {
            if !is_safe_char(c) {
                return Err(UnescapeError {});
            }
            state.push_char(c)?;
            continue;
        }

        // A backslash must be followed by at least one more character.
        let d = ins.next().ok_or(UnescapeError {})?;
        match d {
            '"' | '\\' | '/' => state.push_char(d)?,
            'b' => state.push_char('\x08')?, // backspace
            'f' => state.push_char('\x0C')?, // formfeed
            'n' => state.push_char('\n')?,   // linefeed
            'r' => state.push_char('\r')?,   // carriage return
            't' => state.push_char('\t')?,   // tab
            'u' => {
                let val = parse_u16(&mut ins)?;
                state.push_u16(val)?;
            }
            _ => return Err(UnescapeError {}),
        }
    }

    // Fails if a surrogate is still waiting for its partner.
    state.finalize()
}

// %x22 /          ; "    quotation mark  U+0022
// %x5C /          ; \    reverse solidus U+005C
// %x2F /          ; /    solidus         U+002F
// %x62 /          ; b    backspace       U+0008
// %x66 /          ; f    form feed       U+000C
// %x6E /          ; n    line feed       U+000A
// %x72 /          ; r    carriage return U+000D
// %x74 /          ; t    tab             U+0009
// %x75 4HEXDIG )  ; uXXXX                U+XXXX

// Append the escaped form of `c` to `out`.
//
// Characters with a dedicated two-character escape use it; any other
// character below 0x20 is written as `\u00xx` (lowercase hex).
//
// Panics if `c` is a character that has no short escape and is not below
// 0x20, i.e. one that never needs escaping.
fn force_escape(c: char, out: &mut String) {
    let short_form = match c {
        '\u{08}' => Some("\\b"),
        '\t' => Some("\\t"),
        '\n' => Some("\\n"),
        '\u{0C}' => Some("\\f"),
        '\r' => Some("\\r"),
        '"' => Some("\\\""),
        '\\' => Some("\\\\"),
        _ => None,
    };

    if let Some(text) = short_form {
        out.push_str(text);
        return;
    }

    // RFC8259 allows unicode characters natively, so only the ASCII control
    // characters ever need the \uXXXX form, and those always fit in a
    // single \uXXXX value.
    let code = c as u32;
    if code >= 0x20 {
        panic!("force_escape unnecessary encoding requested");
    }
    out.push_str(&format!("\\u{:04x}", code));
}

/// Escape a string, following RFC8259 rules.
///
/// Only characters that require escaping will be escaped:
/// quotation mark `"`,
/// reverse solidus `\` (backslash),
/// and the control characters (0x00-1F).
///
/// All other characters, including non-ASCII ones, are copied to the
/// output unchanged.
pub fn escape(s: &str) -> String {
    // The output is never shorter than the input, so reserve that much up
    // front; it only grows further if something needs escaping.
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        if is_safe_char(c) {
            out.push(c);
        } else {
            force_escape(c, &mut out);
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    // Escaping and then un-escaping must give back the original text.
    fn check_round_trip(text: &str) {
        let restored = unescape(&escape(text)).unwrap();
        assert_eq!(text, restored);
    }

    #[test]
    fn test_round_trip() {
        let samples = [
            "abc",
            "\n\r\t\x08\x0C\x00",
            r#"\"#,
            r#"""#,
            "Σ𝄞",
            r#"\𝄞"#,
            r#"(╯°□°)╯︵ ┻━┻"#,
        ];
        for &sample in samples.iter() {
            check_round_trip(sample);
        }
    }

    #[test]
    fn test_escape() {
        // (input, expected escaped form)
        let cases = [
            ("\0", r#"\u0000"#),
            ("\n", r#"\n"#),
            (r#"\"#, r#"\\"#),
            (r#"""#, r#"\""#),
            ("Σ", "Σ"), // U+03A3
            ("𝄞", "𝄞"), // U+1D11E
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(escape(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_unescape() {
        // (input, expected un-escaped form)
        let accepted = [
            (r#"abc"#, "abc"),
            (r#"ab\nc"#, "ab\nc"),
            (r#" \"abc\" "#, " \"abc\" "),
            (r#"𝄞"#, "𝄞"),
            (r#"\uD834\uDD1E"#, "𝄞"),
        ];
        for &(input, expected) in accepted.iter() {
            assert_eq!(
                unescape(input),
                Ok(expected.to_string()),
                "input: {:?}",
                input
            );
        }

        // Inputs that must be refused.
        let rejected = [
            r#"ab\zc"#,
            r#"\𝄞"#,
            r#"\uD834"#,
            r#"\uDD1E"#,
            "\t",
        ];
        for &input in rejected.iter() {
            assert_eq!(unescape(input), Err(UnescapeError {}), "input: {:?}", input);
        }
    }
}