core_json/
string.rs

1use core::marker::PhantomData;
2
3use crate::{BytesLike, String, Stack, JsonError};
4
5/// Peek a UTF-8 codepoint from bytes.
6fn peek_utf8<'bytes, B: BytesLike<'bytes>, S: Stack>(
7  bytes: &B,
8  i: usize,
9) -> Result<char, JsonError<'bytes, B, S>> {
10  let mut utf8_codepoint = [0; 4];
11  utf8_codepoint[0] = bytes.peek(i).map_err(JsonError::BytesError)?;
12  let utf8_codepoint_len = usize::from({
13    let first_bit_set = (utf8_codepoint[0] & (1 << 7)) != 0;
14    let third_bit_set = (utf8_codepoint[0] & (1 << 5)) != 0;
15    let fourth_bit_set = (utf8_codepoint[0] & (1 << 4)) != 0;
16    1u8 +
17      u8::from(first_bit_set) +
18      u8::from(first_bit_set & third_bit_set) +
19      u8::from(first_bit_set & third_bit_set & fourth_bit_set)
20  });
21  let utf8_codepoint = &mut utf8_codepoint[.. utf8_codepoint_len];
22  for (j, byte) in utf8_codepoint.iter_mut().enumerate().skip(1) {
23    *byte = bytes.peek(i + j).map_err(JsonError::BytesError)?;
24  }
25
26  let str = core::str::from_utf8(utf8_codepoint).map_err(|_| JsonError::InvalidValue)?;
27  str.chars().next().ok_or(JsonError::InternalError)
28}
29
30/// Read a just-opened string from a JSON serialization.
31pub(crate) fn read_string<'bytes, B: BytesLike<'bytes>, S: Stack>(
32  bytes: &mut B,
33) -> Result<String<'bytes, B>, JsonError<'bytes, B, S>> {
34  // Find the location of the terminating quote
35  let mut i = 0;
36  {
37    let mut escaping = false;
38    loop {
39      let this = peek_utf8(bytes, i)?;
40
41      // https://datatracker.ietf.org/doc/html/rfc8259#section-7
42      let unescaped =
43        matches!(this, '\x20' ..= '\x21' | '\x23' ..= '\x5b' | '\x5d' ..= '\u{10ffff}');
44
45      // If we're escaping the current character, check it's valid to be escaped
46      if escaping {
47        if !matches!(
48          this,
49          '\x22' | '\x5c' | '\x2f' | '\x62' | '\x66' | '\x6e' | '\x72' | '\x74' | '\x75'
50        ) {
51          Err(JsonError::InvalidValue)?;
52        }
53
54        // If this is "\u", check it's followed by hex characters
55        if (this == '\x75') &&
56          (!(peek_utf8(bytes, i + 1)?.is_ascii_hexdigit() &&
57            peek_utf8(bytes, i + 2)?.is_ascii_hexdigit() &&
58            peek_utf8(bytes, i + 3)?.is_ascii_hexdigit() &&
59            peek_utf8(bytes, i + 4)?.is_ascii_hexdigit()))
60        {
61          Err(JsonError::InvalidValue)?;
62        }
63      } else if this == '"' {
64        break;
65      }
66
67      if !(unescaped || escaping || (this == '\\')) {
68        Err(JsonError::InvalidValue)?;
69      }
70      escaping = (!escaping) && (this == '\\');
71      i += this.len_utf8();
72    }
73  }
74
75  let (len, str_bytes) = bytes.read_bytes(i).map_err(JsonError::BytesError)?;
76  // Advance past the closing `"`
77  bytes.advance(1).map_err(JsonError::BytesError)?;
78  Ok(String { len, bytes: str_bytes, _encoding: PhantomData })
79}
80
81/// An interator which yields the characters for an escaped string serialized within JSON.
82pub struct UnescapeString<'bytes, B: BytesLike<'bytes>, S: Stack> {
83  string: B,
84  remaining: usize,
85  _stack: PhantomData<(&'bytes (), S)>,
86}
87impl<'bytes, B: BytesLike<'bytes>, S: Stack> From<String<'bytes, B>>
88  for UnescapeString<'bytes, B, S>
89{
90  fn from(string: String<'bytes, B>) -> Self {
91    Self { remaining: string.len(), string: string.consume(), _stack: PhantomData }
92  }
93}
94impl<'bytes, B: BytesLike<'bytes>, S: Stack> Iterator for UnescapeString<'bytes, B, S> {
95  type Item = Result<char, JsonError<'bytes, B, S>>;
96  fn next(&mut self) -> Option<Self::Item> {
97    // Check if the string is empty
98    if self.remaining == 0 {
99      None?;
100    }
101
102    let res = (|| {
103      {
104        let next_char = peek_utf8(&self.string, 0)?;
105
106        let len = next_char.len_utf8();
107        // `InternalError`: `BytesLike` read past its declared length
108        self.remaining = self.remaining.checked_sub(len).ok_or(JsonError::InternalError)?;
109        self.string.advance(len).map_err(JsonError::BytesError)?;
110
111        // If this isn't an escape character, yield it
112        if next_char != '\\' {
113          return Ok(next_char);
114        }
115      }
116
117      // Definitions from https://datatracker.ietf.org/doc/html/rfc8259#section-7
118      match {
119        // `InternalError`: Escape character without following escaped values
120        self.remaining = self.remaining.checked_sub(1).ok_or(JsonError::InternalError)?;
121        self.string.read_byte().map_err(JsonError::BytesError)?
122      } {
123        // If this is to escape the intended character, yield it now
124        b'"' => Ok('"'),
125        b'\\' => Ok('\\'),
126        b'/' => Ok('/'),
127        // If this is to escape a control sequence, yield it now
128        b'b' => Ok('\x08'),
129        b'f' => Ok('\x0c'),
130        b'n' => Ok('\n'),
131        b'r' => Ok('\r'),
132        b't' => Ok('\t'),
133
134        // Handle if this is a unicode codepoint
135        b'u' => {
136          let mut read_hex = |with_u| {
137            if with_u {
138              let mut backslash_u = [0; 2];
139              self.string.read_into_slice(&mut backslash_u).map_err(JsonError::BytesError)?;
140              if &backslash_u != b"\\u" {
141                Err(JsonError::InvalidValue)?;
142              }
143            }
144
145            let mut hex = [0; 4];
146            self.string.read_into_slice(&mut hex).map_err(JsonError::BytesError)?;
147            // `InternalError`: `\u` without following 'hex' bytes being UTF-8
148            let hex = core::str::from_utf8(&hex).map_err(|_| JsonError::InternalError)?;
149            // `InternalError`: `\u` with UTF-8 bytes which weren't hex
150            u16::from_str_radix(hex, 16).map(u32::from).map_err(|_| JsonError::InternalError)
151          };
152
153          // Read the hex digits
154          // `InternalError`: `\u` without following hex bytes
155          self.remaining = self.remaining.checked_sub(4).ok_or(JsonError::InternalError)?;
156          let next = read_hex(false)?;
157
158          /*
159            If the intended value of this codepoint exceeds 0xffff, it's specified to be encoded
160            with its UTF-16 surrogate pair. We distinguish and fetch the second part if necessary
161            now. For the actual conversion algorithm from the UTF-16 surrogate pair to the UTF
162            codepoint, https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF_(surrogates) is
163            used as reference.
164          */
165          let next_is_utf16_high_surrogate = matches!(next, 0xd800 ..= 0xdbff);
166          let codepoint = if next_is_utf16_high_surrogate {
167            let high = (next - 0xd800) << 10;
168
169            // `InvalidValue`: Caller provided an incomplete code point
170            /*
171              https://datatracker.ietf.org/doc/html/rfc8259#section-8.2 notes how the syntax allows
172              an incomplete codepoint, further noting the behavior of implementations is
173              unpredictable. The definition of "interoperable" is if the strings are composed
174              entirely of Unicode characters, with an unpaired surrogate being considered as unable
175              to encode a Unicode character.
176
177              As Rust requires `char` be a UTF codepoint, we require the strings be "interoperable"
178              per the RFC 8259 definition. While this may be slightly stricter than the
179              specification alone, it already has plenty of ambiguities due to how many slight
180              differences exist with JSON encoders/decoders.
181
182              Additionally, we'll still decode JSON objects with invalidly specified UTF codepoints
183              within their strings. We just won't support converting them to characters with this
184              iterator. This iterator failing will not cause the deserializer as a whole to fail.
185            */
186            self.remaining = self.remaining.checked_sub(6).ok_or(JsonError::InvalidValue)?;
187            let low = read_hex(true)?;
188
189            let Some(low) = low.checked_sub(0xdc00) else { Err(JsonError::InvalidValue)? };
190            high + low + 0x10000
191          } else {
192            // If `next` isn't a surrogate, it's interpreted as a codepoint as-is
193            next
194          };
195
196          // Yield the codepoint
197          char::from_u32(codepoint).ok_or(JsonError::InvalidValue)
198        }
199        // `InternalError`: `\` without a recognized following character
200        _ => Err(JsonError::InternalError),
201      }
202    })();
203
204    // If the result was an error, set `remaining = 0` so all future calls to `next` yield `None`
205    if res.is_err() {
206      self.remaining = 0;
207    }
208
209    Some(res)
210  }
211}