// Quoting and unquoting of torCP `QuotedString` tokens.
use std::borrow::Cow;
use std::iter;
use std::string::FromUtf8Error;
use crate::utils::octal_ascii_triple_to_byte;
/// Error produced by `unquote_string` when the bytes obtained after resolving
/// escape sequences do not form a valid UTF-8 string.
#[derive(Debug, From)]
pub struct UnquoteStringError {
/// Underlying conversion error reported by `String::from_utf8`.
pub error: FromUtf8Error
}
impl Into<Vec<u8>> for UnquoteStringError {
fn into(self) -> Vec<u8> {
self.into_bytes()
}
}
impl UnquoteStringError {
pub fn into_bytes(self) -> Vec<u8> {
self.error.into_bytes()
}
}
/// Parses a torCP `QuotedString` token located at the very beginning of `text`.
///
/// # When is text considered quoted?
/// `text` is treated as quoted when its first character is `"` (no leading white space is
/// allowed) and at least one unescaped `"` follows it. Only the prefix up to that closing quote
/// is consumed, so the first 3 characters of a 100 character string may form the whole token.
///
/// # Escape sequences
/// Inside the quotes `\n`, `\t` and `\r` are read as C escapes and a backslash followed by one
/// to three octal digits is read as a single byte. When the octal value can not be converted to
/// a byte its digits are emitted as-is, as if the backslash was not there.
/// A backslash followed by any other character yields that character.
///
/// # Return value
/// The first element of the tuple is `Some` only when `text` is quoted. It holds the byte offset
/// of the closing quote, so `text.as_bytes()[offset]` is `b'"'`.
///
/// The second element is:
/// * a borrowed `Cow` with the whole, unchanged `text` when `text` is not quoted,
/// * a borrowed `Cow` with the content between the quotes when no escape sequence is used,
/// * an owned `Cow` with the decoded content when escape sequences are used and the decoded
///   bytes are valid UTF-8.
///
/// # Errors
/// `UnquoteStringError` is returned when the decoded bytes are not valid UTF-8. The decoded
/// bytes can be recovered from the error.
///
/// # Note
/// This function does not check whether the content is a single line or whether it contains
/// sequences considered invalid by the specification. The returned value MAY contain non-ASCII
/// characters or zero bytes. It is the caller's responsibility to filter them if needed.
pub fn unquote_string(text: &str) -> (Option<usize>, Result<Cow<str>, UnquoteStringError>) {
    // As the docs say, the format is:
    // RFC 2822 (only the parts needed to interpret the specification)
    // -----
    // qtext = NO-WS-CTL / ; Non white space controls
    //
    // %d33 / ; The rest of the US-ASCII
    // %d35-91 / ; characters not including "\"
    // %d93-126 ; or the quote character
    //
    // qcontent = qtext / quoted-pair
    // quoted-pair = ("\" text) / obs-qp
    // obs-qp = "\" (%d0-127)
    // -----
    // And from the torCP spec:
    // QuotedString = DQUOTE *qcontent DQUOTE
    //
    // "All 8-bit characters are permitted unless explicitly disallowed. In QuotedStrings,
    // backslashes and quotes must be escaped; other characters need not be
    // escaped."
    // "For future-proofing, controller implementors MAY use the following
    // rules to be compatible with buggy Tor implementations and with
    // future ones that implement the spec as intended:
    // Read \n \t \r and \0 ... \377 as C escapes.
    // Treat a backslash followed by any other character as that character."

    // Appends the UTF-8 encoding of `c` to `out`.
    fn push_char(out: &mut Vec<u8>, c: char) {
        let start = out.len();
        out.extend(iter::repeat(0).take(c.len_utf8()));
        c.encode_utf8(&mut out[start..]);
    }

    // Appends the byte described by the first `count` ASCII digits of `digits`.
    fn push_octal_escape(out: &mut Vec<u8>, mut digits: [u8; 3], count: usize) {
        let len = digits.len();
        // Right-align the digits in case there are fewer than three of them,
        // so [1 0 0] with count = 1 becomes [0 0 1].
        digits.rotate_right(len - count);
        match octal_ascii_triple_to_byte(digits) {
            Some(byte) => out.push(byte),
            // Push the raw (not decoded) digits, as if the backslash was ignored.
            // After the rotation they occupy the END of the buffer.
            None => out.extend_from_slice(&digits[len - count..]),
        }
    }

    // The opening quote has to be the very first character. Checking it up front also
    // guarantees that it is exactly one byte wide, which the offsets below rely on.
    if !text.starts_with('"') {
        return (None, Ok(Cow::Borrowed(text)));
    }

    // Look for the first quote that is not preceded by an (unescaped) backslash.
    // The first char is the opening quote, so it can not be the closing one.
    let mut skip_next = false;
    let mut closing_quote = None;
    for (offset, c) in text.char_indices().skip(1) {
        if skip_next {
            skip_next = false;
        } else if c == '\\' {
            skip_next = true;
        } else if c == '"' {
            closing_quote = Some(offset);
            break;
        }
    }
    let end_of_quoted_string = match closing_quote {
        Some(offset) => offset,
        // no closing quote, so this is not a quoted string at all
        None => return (None, Ok(Cow::Borrowed(text))),
    };

    let content = &text[1..end_of_quoted_string];
    if !content.contains('\\') {
        // no escape sequences, the content can be handed out as-is
        return (Some(end_of_quoted_string), Ok(Cow::Borrowed(content)));
    }

    // Resolve escape sequences into bytes first; UTF-8 validity is checked at the very end
    // because octal escapes may describe a multi-byte character one byte at a time.
    // Decoding never produces more bytes than it consumes.
    let mut res = Vec::with_capacity(content.len());
    let mut chars = content.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\\' {
            // the closing quote was found above, everything else is literal
            push_char(&mut res, c);
            continue;
        }
        match chars.next() {
            Some('n') => res.push(b'\n'),
            Some('t') => res.push(b'\t'),
            Some('r') => res.push(b'\r'),
            Some(first @ '0'..='7') => {
                // collect up to three octal digits; the first char that is not
                // an octal digit is left in the iterator and handled normally
                let mut digits = [0u8; 3];
                digits[0] = first as u8;
                let mut count = 1;
                while count < digits.len() {
                    match chars.peek().copied() {
                        Some(digit @ '0'..='7') => {
                            digits[count] = digit as u8;
                            count += 1;
                            chars.next();
                        }
                        _ => break,
                    }
                }
                push_octal_escape(&mut res, digits, count);
            }
            // any other char (including quote, backslash, '8' and '9') stands for itself
            Some(other) => push_char(&mut res, other),
            // A trailing lone backslash can not occur, as it would have escaped the
            // closing quote. Should it ever happen, it is dropped.
            None => {}
        }
    }

    let res = String::from_utf8(res)
        .map(Cow::Owned)
        .map_err(UnquoteStringError::from);
    (Some(end_of_quoted_string), res)
}
/// quote_string takes arbitrary binary data and encodes it as a torCP `QuotedString` token.
///
/// Encoding rules:
/// * `\n`, `\r` and `\t` use backslash notation,
/// * backslash and quote are escaped with a backslash,
/// * ASCII alphanumeric and punctuation characters are emitted as-is,
/// * every other byte (including space) is emitted as a backslash followed by its octal value.
///
/// Octal values are written with as few digits as possible, unless the next input byte is an
/// ASCII digit. In that case the value is padded to three digits, so the following digit can
/// not be mistaken for a part of the escape sequence.
///
/// It's the reverse function to `unquote_string`.
///
/// # Example
/// ```
/// use torut::utils::quote_string;
/// assert_eq!(quote_string(b"asdf"), r#""asdf""#);
/// assert_eq!(quote_string("ŁŁ".as_bytes()), r#""\305\201\305\201""#);
/// assert_eq!(quote_string("\n\r\t".as_bytes()), r#""\n\r\t""#);
/// assert_eq!(quote_string("\0\0\0".as_bytes()), r#""\0\0\0""#);
/// assert_eq!(quote_string(b" 1"), r#""\0401""#);
/// ```
pub fn quote_string(text: &[u8]) -> String {
    // res won't be shorter than text ever
    let mut res = String::with_capacity(text.len() + 2);
    res.push('"');
    let mut bytes = text.iter().copied().peekable();
    while let Some(b) = bytes.next() {
        match b {
            b'\n' => res.push_str("\\n"),
            b'\r' => res.push_str("\\r"),
            b'\t' => res.push_str("\\t"),
            b'\\' => res.push_str("\\\\"),
            b'"' => res.push_str("\\\""),
            b if b.is_ascii_alphanumeric() || b.is_ascii_punctuation() => {
                res.push(b as char);
            }
            b => {
                // all three octal digits of the byte, most significant first
                let digits = [b'0' + (b >> 6), b'0' + ((b >> 3) & 7), b'0' + (b & 7)];
                // A literal digit right after a short escape would be read as a part of it
                // (" 1" -> "\401"), so keep the leading zeros in such case.
                let followed_by_digit = bytes.peek().map_or(false, |next| next.is_ascii_digit());
                let skipped = if followed_by_digit {
                    0
                } else {
                    // drop leading zeros but always keep the last digit (null byte is \0)
                    digits[..2].iter().take_while(|&&d| d == b'0').count()
                };
                res.push('\\');
                res.extend(digits[skipped..].iter().map(|&d| d as char));
            }
        }
    }
    res.push('"');
    res
}
#[cfg(test)]
mod test {
    use super::*;

    /// Quoting a string and unquoting the result has to give the original string back.
    #[test]
    fn test_can_quote_and_unquote_string() {
        for input in [
            "asdf",
            "\0\0\0\0",
            "ŁŁŁ",
            r#"""""#,
            "\n\t\r",
        ].iter().cloned() {
            assert_eq!(
                input,
                unquote_string(&quote_string(input.as_bytes())).1.unwrap().as_ref()
            )
        }
    }

    /// Checks both the returned offset of the closing quote and the decoded value.
    /// `Err` cases hold the decoded bytes, which are not valid UTF-8.
    #[test]
    fn test_can_unquote_string() {
        for (input, output) in [
            ("not quoted string", (None, Ok("not quoted string"))),
            ("\"and a quoted one\"", (Some(17), Ok("and a quoted one"))),
            ("\"esc backslash \\\\ \"", (Some(18), Ok("esc backslash \\ "))),
            // octal escapes of various lengths, also at the very end of the content
            (r#""\0\0\0\0\213\321\3\123\312\31\221\312""#, (
                Some(38),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o312] as &[u8])
            )),
            (r#""\0\0\0\0\213\321\3\123\312\31\221\31""#, (
                Some(37),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o31] as &[u8])
            )),
            (r#""\0\0\0\0\213\321\3\123\312\31\221\3""#, (
                Some(36),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o3] as &[u8])
            )),
            // escaped quote does not end the quoted string
            ("\"q\\\"q\"", (Some(5), Ok("q\"q"))),
            // only the first quoted string is consumed
            ("\"first\"\"second\"", (Some(6), Ok("first"))),
        ].iter().cloned() {
            let (expected_offset, expected_value) = output;
            let (offset, value) = unquote_string(input);
            let value = value.map_err(|e| e.into_bytes());
            assert_eq!(offset, expected_offset);
            assert_eq!(
                value
                    .as_ref()
                    .map(|v| v.as_ref())
                    .map_err(|e| e.as_ref()),
                expected_value
            );
        }
    }
}