jsonb/
util.rs

1// Copyright 2023 Datafuse Labs.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use super::constants::*;
16use super::error::Error;
17use super::error::ParseErrorCode;
18
19#[allow(clippy::zero_prefixed_literal)]
20static HEX: [u8; 256] = {
21    const __: u8 = 255; // not a hex digit
22    [
23        //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
24        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0
25        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1
26        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
27        00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, // 3
28        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4
29        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5
30        __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6
31        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
32        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
33        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
34        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
35        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
36        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
37        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
38        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
39        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
40    ]
41};
42
43pub fn parse_string(mut data: &[u8], len: usize, idx: &mut usize) -> Result<String, Error> {
44    let mut buf = Vec::with_capacity(len);
45    let mut str_buf = String::with_capacity(4);
46    while !data.is_empty() {
47        *idx += 1;
48        let byte = data[0];
49        if byte == b'\\' {
50            data = &data[1..];
51            data = parse_escaped_string(data, idx, &mut str_buf)?;
52            buf.extend_from_slice(str_buf.as_bytes());
53            str_buf.clear();
54        } else {
55            buf.push(byte);
56            data = &data[1..];
57        }
58    }
59    String::from_utf8(buf).map_err(|_| Error::Syntax(ParseErrorCode::InvalidStringValue, *idx))
60}
61
62fn parse_escaped_string<'a>(
63    mut data: &'a [u8],
64    idx: &mut usize,
65    str_buf: &mut String,
66) -> Result<&'a [u8], Error> {
67    if data.is_empty() {
68        return Err(Error::Syntax(
69            ParseErrorCode::UnexpectedEndOfHexEscape,
70            *idx,
71        ));
72    }
73
74    let byte = data[0];
75    *idx += 1;
76    data = &data[1..];
77    match byte {
78        b'\\' => str_buf.push(BS),
79        b'"' => str_buf.push(QU),
80        b'/' => str_buf.push(SD),
81        b'b' => str_buf.push(BB),
82        b'f' => str_buf.push(FF),
83        b'n' => str_buf.push(NN),
84        b'r' => str_buf.push(RR),
85        b't' => str_buf.push(TT),
86        b'u' => {
87            let mut numbers = [0u8; UNICODE_LEN];
88            // Parse the first Unicode escape sequence
89            data = parse_unicode_escape(data, idx, &mut numbers)?;
90            let hex = decode_hex_escape(&numbers, idx)?;
91
92            let c = match hex {
93                0xDC00..=0xDFFF => {
94                    // Low surrogate without preceding high surrogate
95                    encode_invalid_unicode(&numbers, str_buf);
96                    return Ok(data);
97                }
98
99                // Non-BMP characters are encoded as a sequence of two hex
100                // escapes, representing UTF-16 surrogates.
101                n1 @ 0xD800..=0xDBFF => {
102                    // High surrogate - check for following low surrogate
103                    if data.len() < 2 {
104                        encode_invalid_unicode(&numbers, str_buf);
105                        return Ok(data);
106                    }
107
108                    // Check for \u sequence
109                    if data[0] == b'\\' && data[1] == b'u' {
110                        *idx += 2;
111                        data = &data[2..];
112                    } else {
113                        encode_invalid_unicode(&numbers, str_buf);
114                        return Ok(data);
115                    }
116
117                    let mut lower_numbers = [0u8; UNICODE_LEN];
118                    // Parse the second Unicode escape sequence
119                    data = parse_unicode_escape(data, idx, &mut lower_numbers)?;
120                    let n2 = decode_hex_escape(&lower_numbers, idx)?;
121                    if !(0xDC00..=0xDFFF).contains(&n2) {
122                        encode_invalid_unicode(&numbers, str_buf);
123                        encode_invalid_unicode(&lower_numbers, str_buf);
124                        return Ok(data);
125                    }
126
127                    #[allow(clippy::precedence)]
128                    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
129
130                    match char::from_u32(n) {
131                        Some(ch) => ch,
132                        None => {
133                            // Handle invalid Unicode code points gracefully
134                            // If we somehow got an invalid code point, preserve the original escape sequence
135                            encode_invalid_unicode(&numbers, str_buf);
136                            encode_invalid_unicode(&lower_numbers, str_buf);
137                            return Ok(data);
138                        }
139                    }
140                }
141
142                // Regular Unicode code points
143                n => match char::from_u32(n as u32) {
144                    Some(ch) => ch,
145                    None => {
146                        // Handle invalid code points gracefully
147                        encode_invalid_unicode(&numbers, str_buf);
148                        return Ok(data);
149                    }
150                },
151            };
152            str_buf.push(c);
153        }
154        other => return Err(Error::Syntax(ParseErrorCode::InvalidEscaped(other), *idx)),
155    }
156    Ok(data)
157}
158
159/// Parse a Unicode escape sequence and return the updated data slice
160///
161/// This helper function handles both standard \uXXXX and extended \u{XXXX} formats,
162/// extracting the hex digits into the provided buffer.
163#[inline]
164fn parse_unicode_escape<'a>(
165    mut data: &'a [u8],
166    idx: &mut usize,
167    numbers: &mut [u8; UNICODE_LEN],
168) -> Result<&'a [u8], Error> {
169    if data.len() < UNICODE_LEN {
170        return Err(Error::Syntax(
171            ParseErrorCode::UnexpectedEndOfHexEscape,
172            *idx,
173        ));
174    }
175    // Handle \u{XXXX} format (with braces)
176    if data[0] == b'{' {
177        if data.len() < UNICODE_LEN + 2 {
178            return Err(Error::Syntax(
179                ParseErrorCode::UnexpectedEndOfHexEscape,
180                *idx,
181            ));
182        }
183
184        numbers.copy_from_slice(&data[1..UNICODE_LEN + 1]);
185        if data[UNICODE_LEN + 1] != b'}' {
186            return Err(Error::Syntax(
187                ParseErrorCode::UnexpectedEndOfHexEscape,
188                *idx,
189            ));
190        }
191
192        data = &data[UNICODE_LEN + 2..];
193        *idx += UNICODE_LEN + 2;
194    } else {
195        // Standard \uXXXX format
196        numbers.copy_from_slice(&data[..UNICODE_LEN]);
197        data = &data[UNICODE_LEN..];
198        *idx += UNICODE_LEN;
199    }
200
201    Ok(data)
202}
203
204// https://datatracker.ietf.org/doc/html/rfc8259#section-8.2
205// RFC8259 allow invalid Unicode
206#[inline]
207fn encode_invalid_unicode(numbers: &[u8], str_buf: &mut String) {
208    str_buf.push('\\');
209    str_buf.push('u');
210    for n in numbers {
211        str_buf.push((*n).into());
212    }
213}
214
215#[inline]
216fn decode_hex_val(val: u8) -> Option<u16> {
217    let n = HEX[val as usize] as u16;
218    if n == 255 {
219        None
220    } else {
221        Some(n)
222    }
223}
224
225#[inline]
226fn decode_hex_escape(numbers: &[u8], idx: &usize) -> Result<u16, Error> {
227    let mut n = 0;
228    for number in numbers {
229        if let Some(hex) = decode_hex_val(*number) {
230            n = (n << 4) + hex;
231        } else {
232            return Err(Error::Syntax(ParseErrorCode::InvalidHex(*number), *idx));
233        }
234    }
235    Ok(n)
236}
237
238#[cfg(test)]
239mod tests {
240    use super::*;
241    use proptest::prelude::*;
242    use std::fmt::Write;
243
244    #[test]
245    fn test_parse_string() {
246        // Test cases with expected results
247        let test_cases = vec![
248            // Basic strings
249            ("hello", "hello"),
250            ("", ""),
251            ("123", "123"),
252            // Escaped characters
253            (r#"hello\nworld"#, "hello\nworld"),
254            (r#"\"\\\b\f\n\r\t"#, "\"\\\u{8}\u{c}\n\r\t"),
255            (r#"escaped \"quotes\""#, "escaped \"quotes\""),
256            (r#"forward\/slash"#, "forward/slash"),
257            // Unicode escapes - Basic
258            (r#"\u0041\u0042\u0043"#, "ABC"),
259            (r#"Unicode: \u00A9 \u00AE"#, "Unicode: © ®"),
260            // Unicode escapes - Braces syntax
261            (r#"\u{0041}\u{0042}\u{0043}"#, "ABC"),
262            (r#"Unicode: \u{00A9} \u{00AE}"#, "Unicode: © ®"),
263            // Unicode escapes - Surrogate pairs
264            (r#"\uD834\uDD1E"#, "𝄞"),     // G-clef (musical symbol)
265            (r#"\u{D834}\u{DD1E}"#, "𝄞"), // Same with braces
266            // Mixed content
267            (r#"Mixed: \u0041\n\t\"test\""#, "Mixed: A\n\t\"test\""),
268            (r#"CJK: \u4E2D\u6587"#, "CJK: 中文"),
269            // Edge cases
270            (r#"\u007F"#, "\u{7F}"), // DEL character
271            (r#"\u0000"#, "\u{0}"),  // NULL character
272        ];
273
274        // Run all test cases
275        for (input, expected) in test_cases {
276            let input_bytes = input.as_bytes();
277            let mut idx = 0;
278            let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
279
280            assert!(result.is_ok(), "Failed to parse valid string: {}", input);
281            assert_eq!(
282                result.unwrap(),
283                expected,
284                "Incorrect parsing result for: {}",
285                input
286            );
287            assert_eq!(
288                idx,
289                input_bytes.len(),
290                "Index not advanced correctly for: {}",
291                input
292            );
293        }
294
295        // Error cases
296        let error_cases = vec![
297            // Invalid escape sequence
298            r#"\z"#,
299            // Incomplete Unicode escape
300            r#"\u123"#,
301            // Invalid hex in Unicode escape
302            r#"\uGHIJ"#,
303        ];
304
305        for input in error_cases {
306            let input_bytes = if let Ok(s) = std::str::from_utf8(input.as_ref()) {
307                s.as_bytes()
308            } else {
309                input.as_ref()
310            };
311            let mut idx = 0;
312            let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
313            assert!(
314                result.is_err(),
315                "Expected error for invalid input: {:?}",
316                input_bytes
317            );
318        }
319    }
320
321    proptest! {
322        /// Property-based test for parse_string using randomly generated strings
323        ///
324        /// This test generates:
325        /// 1. Regular ASCII strings
326        /// 2. Strings with escaped characters
327        /// 3. Strings with Unicode characters including CJK
328        /// 4. Strings with Unicode escape sequences
329        #[test]
330        fn proptest_parse_string(
331            // Generate regular ASCII strings
332            s1 in r#"[a-zA-Z0-9 ]{0,50}"#,
333            // Generate strings with standard JSON escape sequences
334            s2 in r#"(\\[\"\\\/bfnrt]){0,10}"#,
335            // Generate Unicode characters including CJK
336            s3 in prop::collection::vec(prop::char::range('\u{0020}', '\u{FFFF}'), 0..20).prop_map(|chars| chars.into_iter().collect::<String>()),
337            // Generate valid Unicode escape sequences
338            s4 in prop::collection::vec(0u16..0xD800, 0..5).prop_map(|nums| {
339                nums.into_iter()
340                    .fold(String::new(), |mut output, b| {
341                        let _ = write!(output, r#"\u{:04X}"#, b);
342                        output
343                    })
344            }),
345            // Generate valid Unicode surrogate pairs
346            s5 in prop::collection::vec((0xD800u16..0xDC00, 0xDC00u16..0xE000), 0..3).prop_map(|pairs| {
347                pairs.into_iter()
348                    .fold(String::new(), |mut output, (high, low)| {
349                        let _ = write!(output, r#"\u{:04X}\u{:04X}"#, high, low);
350                        output
351                    })
352            }),
353        ) {
354            // Combine all generated strings
355            let combined = format!("{}{}{}{}{}", s1, s2, s3, s4, s5);
356
357            // Skip empty strings as they're already tested in the unit tests
358            prop_assume!(!combined.is_empty());
359
360            // Convert to a properly escaped JSON string
361            let json_string = serde_json::to_string(&combined).unwrap();
362            // Remove the surrounding quotes that serde_json adds
363            let json_content = &json_string[1..json_string.len()-1];
364
365            // Parse the string using our function
366            let input_bytes = json_content.as_bytes();
367            let mut idx = 0;
368            let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
369
370            // Verify parsing succeeded and produced the expected result
371            prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content);
372            prop_assert_eq!(result.unwrap(), combined, "Incorrect parsing result");
373            prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly");
374        }
375
376        /// Property-based test for parse_string with focus on edge cases
377        ///
378        /// This test specifically targets edge cases like:
379        /// 1. Strings with many escape sequences
380        /// 2. Very long strings
381        /// 3. Strings with complex Unicode patterns
382        #[test]
383        fn proptest_parse_string_edge_cases(
384            // Generate strings with many escape sequences
385            heavy_escapes in prop::collection::vec(
386                prop::sample::select(vec![r#"\\"#, r#"\""#, r#"\n"#, r#"\t"#, r#"\b"#, r#"\f"#, r#"\r"#, r#"\/"#, r#"\u0020"#, r#"\u00A9"#]),
387                1..100
388            ).prop_map(|v| v.join("")),
389
390            // Generate long regular strings
391            long_string in r#"[a-zA-Z0-9 ]{100,500}"#,
392
393            // Generate strings with repeating Unicode patterns
394            unicode_pattern in prop::collection::vec(
395                prop::sample::select(vec![
396                    // ASCII
397                    "ABC",
398                    // Emoji
399                    "😀😁😂",
400                    // CJK
401                    "中文日本語",
402                    // Mixed scripts
403                    "Latin Кириллица العربية",
404                    // Unicode escapes
405                    r#"\u0041\u0042\u0043"#,
406                    // Surrogate pairs
407                    r#"\uD834\uDD1E\uD834\uDD1F"#
408                ]),
409                1..10
410            ).prop_map(|v| v.join("")),
411        ) {
412            // Test each generated string separately
413            for test_str in [heavy_escapes, long_string, unicode_pattern] {
414                // Skip empty strings
415                if test_str.is_empty() {
416                    continue;
417                }
418
419                // Convert to a properly escaped JSON string
420                let json_string = serde_json::to_string(&test_str).unwrap();
421                // Remove the surrounding quotes
422                let json_content = &json_string[1..json_string.len()-1];
423
424                // Parse the string
425                let input_bytes = json_content.as_bytes();
426                let mut idx = 0;
427                let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
428
429                // Verify parsing
430                prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content);
431                prop_assert_eq!(result.unwrap(), test_str, "Incorrect parsing result");
432                prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly");
433            }
434        }
435    }
436}