1use super::constants::*;
16use super::error::Error;
17use super::error::ParseErrorCode;
18
19#[allow(clippy::zero_prefixed_literal)]
20static HEX: [u8; 256] = {
21 const __: u8 = 255; [
23 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, 00, 01, 02, 03, 04, 05, 06, 07, 08, 09, __, __, __, __, __, __, __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, ]
41};
42
43pub fn parse_string(mut data: &[u8], len: usize, idx: &mut usize) -> Result<String, Error> {
44 let mut buf = Vec::with_capacity(len);
45 let mut str_buf = String::with_capacity(4);
46 while !data.is_empty() {
47 *idx += 1;
48 let byte = data[0];
49 if byte == b'\\' {
50 data = &data[1..];
51 data = parse_escaped_string(data, idx, &mut str_buf)?;
52 buf.extend_from_slice(str_buf.as_bytes());
53 str_buf.clear();
54 } else {
55 buf.push(byte);
56 data = &data[1..];
57 }
58 }
59 String::from_utf8(buf).map_err(|_| Error::Syntax(ParseErrorCode::InvalidStringValue, *idx))
60}
61
62fn parse_escaped_string<'a>(
63 mut data: &'a [u8],
64 idx: &mut usize,
65 str_buf: &mut String,
66) -> Result<&'a [u8], Error> {
67 if data.is_empty() {
68 return Err(Error::Syntax(
69 ParseErrorCode::UnexpectedEndOfHexEscape,
70 *idx,
71 ));
72 }
73
74 let byte = data[0];
75 *idx += 1;
76 data = &data[1..];
77 match byte {
78 b'\\' => str_buf.push(BS),
79 b'"' => str_buf.push(QU),
80 b'/' => str_buf.push(SD),
81 b'b' => str_buf.push(BB),
82 b'f' => str_buf.push(FF),
83 b'n' => str_buf.push(NN),
84 b'r' => str_buf.push(RR),
85 b't' => str_buf.push(TT),
86 b'u' => {
87 let mut numbers = [0u8; UNICODE_LEN];
88 data = parse_unicode_escape(data, idx, &mut numbers)?;
90 let hex = decode_hex_escape(&numbers, idx)?;
91
92 let c = match hex {
93 0xDC00..=0xDFFF => {
94 encode_invalid_unicode(&numbers, str_buf);
96 return Ok(data);
97 }
98
99 n1 @ 0xD800..=0xDBFF => {
102 if data.len() < 2 {
104 encode_invalid_unicode(&numbers, str_buf);
105 return Ok(data);
106 }
107
108 if data[0] == b'\\' && data[1] == b'u' {
110 *idx += 2;
111 data = &data[2..];
112 } else {
113 encode_invalid_unicode(&numbers, str_buf);
114 return Ok(data);
115 }
116
117 let mut lower_numbers = [0u8; UNICODE_LEN];
118 data = parse_unicode_escape(data, idx, &mut lower_numbers)?;
120 let n2 = decode_hex_escape(&lower_numbers, idx)?;
121 if !(0xDC00..=0xDFFF).contains(&n2) {
122 encode_invalid_unicode(&numbers, str_buf);
123 encode_invalid_unicode(&lower_numbers, str_buf);
124 return Ok(data);
125 }
126
127 #[allow(clippy::precedence)]
128 let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
129
130 match char::from_u32(n) {
131 Some(ch) => ch,
132 None => {
133 encode_invalid_unicode(&numbers, str_buf);
136 encode_invalid_unicode(&lower_numbers, str_buf);
137 return Ok(data);
138 }
139 }
140 }
141
142 n => match char::from_u32(n as u32) {
144 Some(ch) => ch,
145 None => {
146 encode_invalid_unicode(&numbers, str_buf);
148 return Ok(data);
149 }
150 },
151 };
152 str_buf.push(c);
153 }
154 other => return Err(Error::Syntax(ParseErrorCode::InvalidEscaped(other), *idx)),
155 }
156 Ok(data)
157}
158
159#[inline]
164fn parse_unicode_escape<'a>(
165 mut data: &'a [u8],
166 idx: &mut usize,
167 numbers: &mut [u8; UNICODE_LEN],
168) -> Result<&'a [u8], Error> {
169 if data.len() < UNICODE_LEN {
170 return Err(Error::Syntax(
171 ParseErrorCode::UnexpectedEndOfHexEscape,
172 *idx,
173 ));
174 }
175 if data[0] == b'{' {
177 if data.len() < UNICODE_LEN + 2 {
178 return Err(Error::Syntax(
179 ParseErrorCode::UnexpectedEndOfHexEscape,
180 *idx,
181 ));
182 }
183
184 numbers.copy_from_slice(&data[1..UNICODE_LEN + 1]);
185 if data[UNICODE_LEN + 1] != b'}' {
186 return Err(Error::Syntax(
187 ParseErrorCode::UnexpectedEndOfHexEscape,
188 *idx,
189 ));
190 }
191
192 data = &data[UNICODE_LEN + 2..];
193 *idx += UNICODE_LEN + 2;
194 } else {
195 numbers.copy_from_slice(&data[..UNICODE_LEN]);
197 data = &data[UNICODE_LEN..];
198 *idx += UNICODE_LEN;
199 }
200
201 Ok(data)
202}
203
204#[inline]
207fn encode_invalid_unicode(numbers: &[u8], str_buf: &mut String) {
208 str_buf.push('\\');
209 str_buf.push('u');
210 for n in numbers {
211 str_buf.push((*n).into());
212 }
213}
214
215#[inline]
216fn decode_hex_val(val: u8) -> Option<u16> {
217 let n = HEX[val as usize] as u16;
218 if n == 255 {
219 None
220 } else {
221 Some(n)
222 }
223}
224
225#[inline]
226fn decode_hex_escape(numbers: &[u8], idx: &usize) -> Result<u16, Error> {
227 let mut n = 0;
228 for number in numbers {
229 if let Some(hex) = decode_hex_val(*number) {
230 n = (n << 4) + hex;
231 } else {
232 return Err(Error::Syntax(ParseErrorCode::InvalidHex(*number), *idx));
233 }
234 }
235 Ok(n)
236}
237
238#[cfg(test)]
239mod tests {
240 use super::*;
241 use proptest::prelude::*;
242 use std::fmt::Write;
243
244 #[test]
245 fn test_parse_string() {
246 let test_cases = vec![
248 ("hello", "hello"),
250 ("", ""),
251 ("123", "123"),
252 (r#"hello\nworld"#, "hello\nworld"),
254 (r#"\"\\\b\f\n\r\t"#, "\"\\\u{8}\u{c}\n\r\t"),
255 (r#"escaped \"quotes\""#, "escaped \"quotes\""),
256 (r#"forward\/slash"#, "forward/slash"),
257 (r#"\u0041\u0042\u0043"#, "ABC"),
259 (r#"Unicode: \u00A9 \u00AE"#, "Unicode: © ®"),
260 (r#"\u{0041}\u{0042}\u{0043}"#, "ABC"),
262 (r#"Unicode: \u{00A9} \u{00AE}"#, "Unicode: © ®"),
263 (r#"\uD834\uDD1E"#, "𝄞"), (r#"\u{D834}\u{DD1E}"#, "𝄞"), (r#"Mixed: \u0041\n\t\"test\""#, "Mixed: A\n\t\"test\""),
268 (r#"CJK: \u4E2D\u6587"#, "CJK: 中文"),
269 (r#"\u007F"#, "\u{7F}"), (r#"\u0000"#, "\u{0}"), ];
273
274 for (input, expected) in test_cases {
276 let input_bytes = input.as_bytes();
277 let mut idx = 0;
278 let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
279
280 assert!(result.is_ok(), "Failed to parse valid string: {}", input);
281 assert_eq!(
282 result.unwrap(),
283 expected,
284 "Incorrect parsing result for: {}",
285 input
286 );
287 assert_eq!(
288 idx,
289 input_bytes.len(),
290 "Index not advanced correctly for: {}",
291 input
292 );
293 }
294
295 let error_cases = vec![
297 r#"\z"#,
299 r#"\u123"#,
301 r#"\uGHIJ"#,
303 ];
304
305 for input in error_cases {
306 let input_bytes = if let Ok(s) = std::str::from_utf8(input.as_ref()) {
307 s.as_bytes()
308 } else {
309 input.as_ref()
310 };
311 let mut idx = 0;
312 let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
313 assert!(
314 result.is_err(),
315 "Expected error for invalid input: {:?}",
316 input_bytes
317 );
318 }
319 }
320
321 proptest! {
322 #[test]
330 fn proptest_parse_string(
331 s1 in r#"[a-zA-Z0-9 ]{0,50}"#,
333 s2 in r#"(\\[\"\\\/bfnrt]){0,10}"#,
335 s3 in prop::collection::vec(prop::char::range('\u{0020}', '\u{FFFF}'), 0..20).prop_map(|chars| chars.into_iter().collect::<String>()),
337 s4 in prop::collection::vec(0u16..0xD800, 0..5).prop_map(|nums| {
339 nums.into_iter()
340 .fold(String::new(), |mut output, b| {
341 let _ = write!(output, r#"\u{:04X}"#, b);
342 output
343 })
344 }),
345 s5 in prop::collection::vec((0xD800u16..0xDC00, 0xDC00u16..0xE000), 0..3).prop_map(|pairs| {
347 pairs.into_iter()
348 .fold(String::new(), |mut output, (high, low)| {
349 let _ = write!(output, r#"\u{:04X}\u{:04X}"#, high, low);
350 output
351 })
352 }),
353 ) {
354 let combined = format!("{}{}{}{}{}", s1, s2, s3, s4, s5);
356
357 prop_assume!(!combined.is_empty());
359
360 let json_string = serde_json::to_string(&combined).unwrap();
362 let json_content = &json_string[1..json_string.len()-1];
364
365 let input_bytes = json_content.as_bytes();
367 let mut idx = 0;
368 let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
369
370 prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content);
372 prop_assert_eq!(result.unwrap(), combined, "Incorrect parsing result");
373 prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly");
374 }
375
376 #[test]
383 fn proptest_parse_string_edge_cases(
384 heavy_escapes in prop::collection::vec(
386 prop::sample::select(vec![r#"\\"#, r#"\""#, r#"\n"#, r#"\t"#, r#"\b"#, r#"\f"#, r#"\r"#, r#"\/"#, r#"\u0020"#, r#"\u00A9"#]),
387 1..100
388 ).prop_map(|v| v.join("")),
389
390 long_string in r#"[a-zA-Z0-9 ]{100,500}"#,
392
393 unicode_pattern in prop::collection::vec(
395 prop::sample::select(vec![
396 "ABC",
398 "😀😁😂",
400 "中文日本語",
402 "Latin Кириллица العربية",
404 r#"\u0041\u0042\u0043"#,
406 r#"\uD834\uDD1E\uD834\uDD1F"#
408 ]),
409 1..10
410 ).prop_map(|v| v.join("")),
411 ) {
412 for test_str in [heavy_escapes, long_string, unicode_pattern] {
414 if test_str.is_empty() {
416 continue;
417 }
418
419 let json_string = serde_json::to_string(&test_str).unwrap();
421 let json_content = &json_string[1..json_string.len()-1];
423
424 let input_bytes = json_content.as_bytes();
426 let mut idx = 0;
427 let result = parse_string(input_bytes, input_bytes.len(), &mut idx);
428
429 prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content);
431 prop_assert_eq!(result.unwrap(), test_str, "Incorrect parsing result");
432 prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly");
433 }
434 }
435 }
436}