1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
//! Utilities for validating string and char literals and turning them into
//! values they represent.
//! From `rust/compiler/rustc_lexer/src/unescape.rs`.
use std::ops::Range;
use std::str::Chars;
/// Errors that can occur during string unescaping.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
/// Expected 1 char, but 0 were found.
ZeroChars,
/// Expected 1 char, but more than 1 were found.
MoreThanOneChar,
/// Escaped '\' character without continuation.
LoneSlash,
/// Invalid escape character (e.g. '\z').
InvalidEscape,
/// Raw '\r' encountered.
BareCarriageReturn,
/// Raw '\r' encountered in raw string.
BareCarriageReturnInRawString,
/// Unescaped character that was expected to be escaped (e.g. raw '\t').
EscapeOnlyChar,
/// Numeric character escape is too short (e.g. '\x1').
TooShortHexEscape,
/// Invalid character in numeric escape (e.g. '\xz')
InvalidCharInHexEscape,
/// Character code in numeric escape is non-ascii (e.g. '\xFF').
OutOfRangeHexEscape,
/// '\u' not followed by '{'.
NoBraceInUnicodeEscape,
/// Non-hexadecimal value in '\u{..}'.
InvalidCharInUnicodeEscape,
/// '\u{}'
EmptyUnicodeEscape,
/// No closing brace in '\u{..}', e.g. '\u{12'.
UnclosedUnicodeEscape,
/// '\u{_12}'
LeadingUnderscoreUnicodeEscape,
/// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
OverlongUnicodeEscape,
/// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
LoneSurrogateUnicodeEscape,
/// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
OutOfRangeUnicodeEscape,
/// Unicode escape code in byte literal.
UnicodeEscapeInByte,
/// Non-ascii character in byte literal.
NonAsciiCharInByte,
/// Non-ascii character in byte string literal.
NonAsciiCharInByteString,
}
/// This totally custom function escapes a string.
///
/// We re-use the Rust escaper because we have the same string syntax! Haha!
pub fn escape_str(s: &str) -> String {
format!("{:?}", s)
}
pub fn unescape_str_or_byte_str_all(s: &str) -> String {
if s.contains(&['\\', '\r'][..]) {
let mut buf = String::with_capacity(s.len());
let mut error = false;
unescape_str_or_byte_str(&s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(e) => {
error = true;
buf = format!("<Lexer error: string: {:?}>", e);
}
};
});
buf
} else {
s.to_string()
}
}
fn scan_escape(first_char: char, chars: &mut Chars<'_>) -> Result<char, EscapeError> {
if first_char != '\\' {
// Previous character was not a slash, and we don't expect it to be
// an escape-only character.
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
'"' => Err(EscapeError::EscapeOnlyChar),
_ => Ok(first_char),
};
}
// Previous character is '\\', try to unescape it.
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
let res = match second_char {
'"' => '"',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\\' => '\\',
'\'' => '\'',
'0' => '\0',
'x' => {
// Parse hexadecimal character code.
let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
let value = hi * 16 + lo;
let value = value as u8;
value as char
}
'u' => {
// We've parsed '\u', now we have to parse '{..}'.
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// First character must be a hexadecimal digit.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
'}' => return Err(EscapeError::EmptyUnicodeEscape),
c => c
.to_digit(16)
.ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
// First character is valid, now parse the rest of the number
// and closing brace.
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
Some('_') => continue,
Some('}') => {
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
// Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal.
break std::char::from_u32(value).ok_or_else(|| {
if value > 0x10FFFF {
EscapeError::OutOfRangeUnicodeEscape
} else {
EscapeError::LoneSurrogateUnicodeEscape
}
})?;
}
Some(c) => {
let digit = c
.to_digit(16)
.ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
if n_digits > 6 {
// Stop updating value since we're sure that it's is incorrect already.
continue;
}
let digit = digit as u32;
value = value * 16 + digit;
}
};
}
}
_ => return Err(EscapeError::InvalidEscape),
};
Ok(res)
}
/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
pub fn unescape_str_or_byte_str<F>(src: &str, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let initial_len = src.len();
let mut chars = src.chars();
while let Some(first_char) = chars.next() {
let start = initial_len - chars.as_str().len() - first_char.len_utf8();
let unescaped_char = match first_char {
'\\' => {
let second_char = chars.clone().next();
match second_char {
Some('\n') => {
// Rust language specification requires us to skip whitespaces
// if unescaped '\' character is followed by '\n'.
// For details see [Rust language reference]
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
skip_ascii_whitespace(&mut chars);
continue;
}
_ => scan_escape(first_char, &mut chars),
}
}
'\n' => Ok('\n'),
'\t' => Ok('\t'),
_ => scan_escape(first_char, &mut chars),
};
let end = initial_len - chars.as_str().len();
callback(start..end, unescaped_char);
}
fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
let str = chars.as_str();
let first_non_space = str
.bytes()
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
.unwrap_or(str.len());
*chars = str[first_non_space..].chars()
}
}