1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
use std::borrow::Cow;
use std::iter;
use std::string::FromUtf8Error;

use crate::utils::octal_ascii_triple_to_byte;

/// Error returned by `unquote_string` when the unescaped bytes are not valid UTF-8.
///
/// The decoded bytes are not lost: they can be recovered with `into_bytes`
/// (or through the `Into<Vec<u8>>` conversion).
// NOTE(review): `From` is not imported in this file; presumably it is a derive macro
// made available crate-wide (e.g. derive_more) that generates
// `From<FromUtf8Error> for UnquoteStringError` - confirm against the crate root.
#[derive(Debug, From)]
pub struct UnquoteStringError {
    /// Underlying UTF-8 conversion error; it still owns the bytes that failed validation.
    pub error: FromUtf8Error
}

impl Into<Vec<u8>> for UnquoteStringError {
    fn into(self) -> Vec<u8> {
        self.into_bytes()
    }
}

impl UnquoteStringError {
    pub fn into_bytes(self) -> Vec<u8> {
        self.error.into_bytes()
    }
}

/// Parses a `QuotedString` token (as named by the torCP docs) from the start of `text`.
///
/// # When is text quoted?
/// `text` is considered quoted when its very first byte is `"` (no leading white chars are
/// allowed) and at least one unescaped `"` follows it. Only the prefix up to that closing
/// quote is consumed, so e.g. the first 3 bytes of a 100 byte text may form the whole token.
///
/// # Escape sequences
/// Inside the quotes `\n`, `\t` and `\r` are read as C escapes and a backslash followed by one
/// to three octal digits (`\0` ... `\377`) is read as an octal escape. A backslash followed
/// by any other character (including `\`, `"`, `8` and `9`) yields that character unchanged.
///
/// # Return value
/// The first element is `Some(idx)` only when `text` is quoted; `idx` is the byte offset of the
/// closing quote, so `text.as_bytes()[idx]` is `b'"'`.
///
/// The second element is:
/// * `Ok(Cow::Borrowed(text))` when `text` is not quoted,
/// * `Ok(Cow::Borrowed(..))` with the content between the quotes when it has no backslash,
/// * `Ok(Cow::Owned(..))` when escape sequences were decoded and the result is valid UTF-8,
/// * `Err(..)` when the decoded bytes are not valid UTF-8; the bytes can be recovered with
///   `UnquoteStringError::into_bytes`.
///
/// No further validation is performed: the result MAY contain non-ASCII chars, zero bytes or
/// line breaks. It is the caller's responsibility to filter them if needed.
pub fn unquote_string(text: &str) -> (Option<usize>, Result<Cow<str>, UnquoteStringError>) {
    // Grammar, as the docs say.
    //
    // RFC 2822 (only the parts needed to interpret the torCP specification):
    // -----
    // qtext           =       NO-WS-CTL /     ; Non white space controls
    //
    //                         %d33 /          ; The rest of the US-ASCII
    //                         %d35-91 /       ;  characters not including "\"
    //                         %d93-126        ;  or the quote character
    //
    // qcontent        =       qtext / quoted-pair
    // quoted-pair     =       ("\" text) / obs-qp
    // obs-qp          =       "\" (%d0-127)
    // -----
    // torCP spec:
    // QuotedString = DQUOTE *qcontent DQUOTE
    //
    // "All 8-bit characters are permitted unless explicitly disallowed. In QuotedStrings,
    // backslashes and quotes must be escaped; other characters need not be
    // escaped."
    //
    //  "For future-proofing, controller implementors MAY use the following
    //  rules to be compatible with buggy Tor implementations and with
    //  future ones that implement the spec as intended:
    //    Read \n \t \r and \0 ... \377 as C escapes.
    //    Treat a backslash followed by any other character as that character."
    let closing_quote = match find_closing_quote(text) {
        Some(idx) => idx,
        // Empty text, text without opening quote or without closing quote: not a token.
        None => return (None, Ok(Cow::Borrowed(text))),
    };

    let content = &text[1..closing_quote];
    if !content.contains('\\') {
        // No escape sequences, so the content can be handed out without copying.
        return (Some(closing_quote), Ok(Cow::Borrowed(content)));
    }

    let unquoted = String::from_utf8(decode_escapes(content))
        .map(Cow::Owned)
        .map_err(UnquoteStringError::from);
    (Some(closing_quote), unquoted)
}

/// Returns the byte offset of the first unescaped `"` after the opening quote, or `None`
/// when `text` does not start with `"` or has no closing quote.
fn find_closing_quote(text: &str) -> Option<usize> {
    // Check the opening quote first: text which is not quoted needs no scanning at all, and
    // a one byte first char guarantees that the scan below starts at byte offset 1.
    if !text.starts_with('"') {
        return None;
    }
    let mut escaped = false;
    for (idx, c) in text.char_indices().skip(1) {
        if escaped {
            // Whatever follows a backslash can't terminate the string.
            escaped = false;
        } else if c == '\\' {
            escaped = true;
        } else if c == '"' {
            return Some(idx);
        }
    }
    None
}

/// Decodes backslash escapes in the content of a quoted string into raw bytes.
fn decode_escapes(content: &str) -> Vec<u8> {
    // Escapes only ever shrink the text, so this capacity is always sufficient.
    let mut out = Vec::with_capacity(content.len());
    let mut utf8 = [0u8; 4];
    let mut after_backslash = false;
    // ASCII digits of the octal escape being collected (if any).
    let mut octal = [0u8; 3];
    let mut octal_len = 0;

    // The trailing `None` marks the end of input, so an octal escape at the very end of the
    // content is finished by the same code path as one followed by some other char.
    for next in content.chars().map(Some).chain(iter::once(None)) {
        if octal_len > 0 {
            match next {
                Some(c) if octal_len < octal.len() && c.is_digit(8) => {
                    octal[octal_len] = c as u8;
                    octal_len += 1;
                    continue;
                }
                _ => {
                    // Escape is complete; current char (if any) is handled as usual below.
                    push_octal_escape(&octal[..octal_len], &mut out);
                    octal_len = 0;
                }
            }
        }

        let c = match next {
            Some(c) => c,
            None => break,
        };
        if after_backslash {
            after_backslash = false;
            match c {
                'n' => out.push(b'\n'),
                't' => out.push(b'\t'),
                'r' => out.push(b'\r'),
                _ if c.is_digit(8) => {
                    octal[0] = c as u8;
                    octal_len = 1;
                }
                // Any other char stands for itself. This covers `\"` and `\\` as well as
                // `\8` and `\9`, which can't start an octal escape.
                _ => out.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes()),
            }
        } else if c == '\\' {
            after_backslash = true;
        } else {
            out.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
        }
    }
    out
}

/// Appends the byte described by one to three ASCII octal `digits` to `out`.
///
/// When `octal_ascii_triple_to_byte` rejects the digits they are appended as-is instead, as
/// if the backslash in front of them was ignored.
fn push_octal_escape(digits: &[u8], out: &mut Vec<u8>) {
    debug_assert!(!digits.is_empty() && digits.len() <= 3);
    // Right-align the digits and pad with zero bytes on the left,
    // so `[b'1']` becomes `[0, 0, b'1']`.
    let mut triple = [0u8; 3];
    triple[3 - digits.len()..].copy_from_slice(digits);
    match octal_ascii_triple_to_byte(triple) {
        Some(byte) => out.push(byte),
        None => out.extend_from_slice(digits),
    }
}

/// quote_string takes arbitrary binary data and encodes it as a quoted string.
///
/// ASCII letters, digits and punctuation are emitted as-is, except for `\` and `"` which are
/// escaped with a backslash. For `\n`, `\t` and `\r` the backslash notation is used. Every
/// other byte (space and non-ASCII bytes included) is emitted as a backslash followed by its
/// octal value.
///
/// Octal escapes use as few digits as possible. The only exception is an escape directly
/// followed by a literal octal digit (`0`-`7`): it is padded to three digits, because
/// otherwise a decoder would read that digit as a part of the escape.
///
/// It's reverse function to `unquote_string`.
/// According to torCP docs it creates `QuotedString` token.
///
/// # Example
/// ```
/// use torut::utils::quote_string;
/// assert_eq!(quote_string(b"asdf"), r#""asdf""#);
/// assert_eq!(quote_string("ŁŁ".as_bytes()), r#""\305\201\305\201""#);
/// assert_eq!(quote_string("\n\r\t".as_bytes()), r#""\n\r\t""#);
/// assert_eq!(quote_string("\0\0\0".as_bytes()), r#""\0\0\0""#);
/// assert_eq!(quote_string(b"\x012"), r#""\0012""#);
/// ```
pub fn quote_string(text: &[u8]) -> String {
    // res won't be shorter than text plus two quotes ever
    let mut res = String::with_capacity(text.len() + 2);
    res.push('"');
    let mut bytes = text.iter().copied().peekable();
    while let Some(b) = bytes.next() {
        match b {
            b'\n' => res.push_str("\\n"),
            b'\r' => res.push_str("\\r"),
            b'\t' => res.push_str("\\t"),
            // these two are punctuation, so they have to be matched before the arm below
            b'\\' => res.push_str("\\\\"),
            b'"' => res.push_str("\\\""),
            b if b.is_ascii_alphanumeric() || b.is_ascii_punctuation() => {
                res.push(b as char);
            }
            b => {
                // A literal octal digit right after the escape would be parsed as a part
                // of it (`\1` + `2` reads as `\12`), unless the escape already uses all
                // three digits.
                let digit_follows = bytes
                    .peek()
                    .map_or(false, |next| (b'0'..=b'7').contains(next));
                let digit_count: u32 = if digit_follows || b >= 0o100 {
                    3
                } else if b >= 0o10 {
                    2
                } else {
                    // null byte ends up here as well and becomes \0
                    1
                };
                res.push('\\');
                // most significant digit first, three bits per octal digit
                for position in (0..digit_count).rev() {
                    let digit = (b >> (3 * position)) & 0o7;
                    res.push((b'0' + digit) as char);
                }
            }
        }
    }
    res.push('"');
    res
}

#[cfg(test)]
mod test {
    use super::*;

    /// Quoting and then unquoting has to give back the original text.
    #[test]
    fn test_can_quote_and_unquote_string() {
        let samples = ["asdf", "\0\0\0\0", "ŁŁŁ", r#"""""#, "\n\t\r"];
        for &sample in samples.iter() {
            let quoted = quote_string(sample.as_bytes());
            let (_, unquoted) = unquote_string(&quoted);
            assert_eq!(sample, &*unquoted.unwrap());
        }
    }

    /// Checks both the reported offset of the closing quote and the unquoted value.
    #[test]
    fn test_can_unquote_string() {
        // (input, expected offset, expected text or - for invalid utf8 - expected raw bytes)
        let cases: &[(&str, Option<usize>, Result<&str, &[u8]>)] = &[
            ("not quoted string", None, Ok("not quoted string")),
            ("\"and a quoted one\"", Some(17), Ok("and a quoted one")),
            ("\"esc backslash \\\\ \"", Some(18), Ok("esc backslash \\ ")),
            (
                r#""\0\0\0\0\213\321\3\123\312\31\221\312""#,
                Some(38),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o312] as &[u8]),
            ),
            (
                r#""\0\0\0\0\213\321\3\123\312\31\221\31""#,
                Some(37),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o31] as &[u8]),
            ),
            (
                r#""\0\0\0\0\213\321\3\123\312\31\221\3""#,
                Some(36),
                Err(&[0u8, 0, 0, 0, 0o213, 0o321, 0o3, 0o123, 0o312, 0o31, 0o221, 0o3] as &[u8]),
            ),
            ("\"q\\\"q\"", Some(5), Ok("q\"q")),
            ("\"first\"\"second\"", Some(6), Ok("first")),
        ];

        for &(input, expected_offset, expected_value) in cases {
            let (offset, value) = unquote_string(input);
            // errors are compared by the raw bytes they carry
            let value = value.map_err(UnquoteStringError::into_bytes);
            assert_eq!(offset, expected_offset);

            let actual = value
                .as_ref()
                .map(|text| &**text)
                .map_err(|bytes| bytes.as_slice());
            assert_eq!(actual, expected_value);
        }
    }
}