jsonc_parser/
string.rs

1use std::borrow::Cow;
2
3pub struct ParseStringError {
4  pub byte_index: usize,
5  pub kind: ParseStringErrorKind,
6}
7
/// The reason a string literal failed to parse.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ParseStringErrorKind {
  /// An escaped double quote (`\"`) was found inside a single-quoted string.
  InvalidEscapeInSingleQuoteString,
  /// An escaped single quote (`\'`) was found inside a double-quoted string.
  InvalidEscapeInDoubleQuoteString,
  /// A `\u` escape was not followed by four hexadecimal digits.
  ExpectedFourHexDigits,
  /// A `\u` escape did not decode to a valid character (for example an
  /// unpaired surrogate). Carries the offending hex digits, possibly followed
  /// by a parenthesized explanation.
  InvalidUnicodeEscapeSequence(String),
  /// A backslash was followed by a character that is not a supported escape.
  InvalidEscape,
  /// The input ended before the closing quote was found.
  UnterminatedStringLiteral,
}
17
// Empty impl: only the trait's default methods are used, so no underlying
// source error is exposed.
impl std::error::Error for ParseStringErrorKind {}
19
20impl std::fmt::Display for ParseStringErrorKind {
21  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
22    match self {
23      ParseStringErrorKind::InvalidEscapeInSingleQuoteString => {
24        write!(f, "Invalid escape in single quote string")
25      }
26      ParseStringErrorKind::InvalidEscapeInDoubleQuoteString => {
27        write!(f, "Invalid escape in double quote string")
28      }
29      ParseStringErrorKind::ExpectedFourHexDigits => {
30        write!(f, "Expected four hex digits")
31      }
32      ParseStringErrorKind::InvalidUnicodeEscapeSequence(value) => {
33        write!(
34          f,
35          "Invalid unicode escape sequence. '{}' is not a valid UTF8 character",
36          value
37        )
38      }
39      ParseStringErrorKind::InvalidEscape => {
40        write!(f, "Invalid escape")
41      }
42      ParseStringErrorKind::UnterminatedStringLiteral => {
43        write!(f, "Unterminated string literal")
44      }
45    }
46  }
47}
48
/// Character cursor that the string parser in this file reads from.
///
/// `'a` is the lifetime of the underlying text; it lets a parsed string
/// borrow directly from that text when no escape sequences are present.
pub trait CharProvider<'a> {
  /// Returns the character the provider is currently positioned on, if any.
  /// The parser expects this to be the opening quote when parsing starts.
  fn current_char(&mut self) -> Option<char>;
  /// Returns the byte offset of the current position. The parser derives the
  /// bounds it slices `text()` with from this value.
  fn byte_index(&self) -> usize;
  /// Advances to the next character and returns it. The parser treats `None`
  /// as the end of the input.
  fn move_next_char(&mut self) -> Option<char>;
  /// Returns the text being read; parsed strings are sliced out of it.
  fn text(&self) -> &'a str;
}
55
/// Parses the quoted string literal at the start of `text`.
///
/// `text` is expected to begin with `'` or `"`; the underlying parser only
/// checks this with a debug assertion. Scanning stops at the matching closing
/// quote, so anything after it is not inspected.
///
/// Returns a borrowed slice of `text` when the literal has no escape
/// sequences and an owned, unescaped string otherwise.
///
/// # Errors
///
/// Returns a [`ParseStringError`] for an invalid escape sequence or when the
/// closing quote is missing.
#[cfg(feature = "cst")]
pub fn parse_string(text: &str) -> Result<Cow<'_, str>, ParseStringError> {
  /// Minimal cursor that walks a `&str` one character at a time.
  struct StrCursor<'a> {
    source: &'a str,
    offset: usize,
    current: Option<char>,
    remaining: std::str::Chars<'a>,
  }

  impl<'a> CharProvider<'a> for StrCursor<'a> {
    fn current_char(&mut self) -> Option<char> {
      self.current
    }

    fn byte_index(&self) -> usize {
      self.offset
    }

    fn move_next_char(&mut self) -> Option<char> {
      // Step past the current character (if any) before fetching the next;
      // once the text is exhausted the offset stops moving.
      self.offset += self.current.map_or(0, char::len_utf8);
      self.current = self.remaining.next();
      self.current
    }

    fn text(&self) -> &'a str {
      self.source
    }
  }

  let mut remaining = text.chars();
  let current = remaining.next();
  let mut cursor = StrCursor {
    source: text,
    offset: 0,
    current,
    remaining,
  };

  parse_string_with_char_provider(&mut cursor)
}
97
98pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
99  chars: &mut T,
100) -> Result<Cow<'a, str>, ParseStringError> {
101  debug_assert!(
102    chars.current_char() == Some('\'') || chars.current_char() == Some('"'),
103    "Expected \", was {:?}",
104    chars.current_char()
105  );
106  let is_double_quote = chars.current_char() == Some('"');
107  let mut last_start_byte_index = chars.byte_index() + 1;
108  let mut text: Option<String> = None;
109  let mut last_was_backslash = false;
110  let mut found_end_string = false;
111  let token_start = chars.byte_index();
112
113  while let Some(current_char) = chars.move_next_char() {
114    if last_was_backslash {
115      let escape_start = chars.byte_index() - 1; // -1 for backslash
116      match current_char {
117        '"' | '\'' | '\\' | '/' | 'b' | 'f' | 'u' | 'r' | 'n' | 't' => {
118          if current_char == '"' {
119            if !is_double_quote {
120              return Err(ParseStringError {
121                byte_index: escape_start,
122                kind: ParseStringErrorKind::InvalidEscapeInSingleQuoteString,
123              });
124            }
125          } else if current_char == '\'' && is_double_quote {
126            return Err(ParseStringError {
127              byte_index: escape_start,
128              kind: ParseStringErrorKind::InvalidEscapeInDoubleQuoteString,
129            });
130          }
131
132          let previous_text = &chars.text()[last_start_byte_index..escape_start];
133          if text.is_none() {
134            text = Some(String::new());
135          }
136          let text = text.as_mut().unwrap();
137          text.push_str(previous_text);
138          if current_char == 'u' {
139            let hex_char = parse_hex_char(chars).map_err(|kind| ParseStringError {
140              byte_index: escape_start,
141              kind,
142            })?;
143            text.push(hex_char);
144            last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
145          } else {
146            text.push(match current_char {
147              'b' => '\u{08}',
148              'f' => '\u{0C}',
149              't' => '\t',
150              'r' => '\r',
151              'n' => '\n',
152              _ => current_char,
153            });
154            last_start_byte_index = chars.byte_index() + current_char.len_utf8();
155          }
156        }
157        _ => {
158          return Err(ParseStringError {
159            byte_index: escape_start,
160            kind: ParseStringErrorKind::InvalidEscape,
161          });
162        }
163      }
164      last_was_backslash = false;
165    } else if is_double_quote && current_char == '"' || !is_double_quote && current_char == '\'' {
166      found_end_string = true;
167      break;
168    } else {
169      last_was_backslash = current_char == '\\';
170    }
171  }
172
173  if found_end_string {
174    chars.move_next_char();
175    let final_segment = &chars.text()[last_start_byte_index..chars.byte_index() - 1];
176    Ok(match text {
177      Some(mut text) => {
178        text.push_str(final_segment);
179        Cow::Owned(text)
180      }
181      None => Cow::Borrowed(final_segment),
182    })
183  } else {
184    Err(ParseStringError {
185      byte_index: token_start,
186      kind: ParseStringErrorKind::UnterminatedStringLiteral,
187    })
188  }
189}
190
191fn parse_hex_char<'a, T: CharProvider<'a>>(chars: &mut T) -> Result<char, ParseStringErrorKind> {
192  let mut hex_text = String::new();
193  // expect four hex values
194  for _ in 0..4 {
195    let current_char = chars.move_next_char();
196    if !is_hex(current_char) {
197      return Err(ParseStringErrorKind::ExpectedFourHexDigits);
198    }
199    if let Some(current_char) = current_char {
200      hex_text.push(current_char);
201    }
202  }
203
204  let hex_value = match u32::from_str_radix(&hex_text, 16) {
205    Ok(v) => v,
206    Err(_) => {
207      return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
208    }
209  };
210
211  // Check if this is a high surrogate (0xD800-0xDBFF)
212  let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
213    // High surrogate - must be followed by low surrogate
214    // Peek ahead for \uXXXX pattern
215    let next_char = chars.move_next_char();
216    if next_char != Some('\\') {
217      return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
218        "{} (unpaired high surrogate)",
219        hex_text
220      )));
221    }
222
223    let next_char = chars.move_next_char();
224    if next_char != Some('u') {
225      return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
226        "{} (unpaired high surrogate)",
227        hex_text
228      )));
229    }
230
231    // Parse the second \uXXXX
232    let mut hex_text2 = String::new();
233    for _ in 0..4 {
234      let current_char = chars.move_next_char();
235      if !is_hex(current_char) {
236        return Err(ParseStringErrorKind::ExpectedFourHexDigits);
237      }
238      if let Some(current_char) = current_char {
239        hex_text2.push(current_char);
240      }
241    }
242
243    let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
244      Ok(v) => v,
245      Err(_) => {
246        return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2));
247      }
248    };
249
250    // Verify it's a low surrogate (0xDC00-0xDFFF)
251    if !(0xDC00..=0xDFFF).contains(&hex_value2) {
252      return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
253        "{} (high surrogate not followed by low surrogate)",
254        hex_text
255      )));
256    }
257
258    // Combine surrogate pair using RFC 8259 formula
259    let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
260
261    match std::char::from_u32(code_point) {
262      Some(c) => c,
263      None => {
264        return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
265          "{}\\u{} (invalid surrogate pair)",
266          hex_text, hex_text2
267        )));
268      }
269    }
270  } else if (0xDC00..=0xDFFF).contains(&hex_value) {
271    // Low surrogate without high surrogate
272    return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
273      "{} (unpaired low surrogate)",
274      hex_text
275    )));
276  } else {
277    // Normal unicode escape
278    match std::char::from_u32(hex_value) {
279      Some(hex_char) => hex_char,
280      None => {
281        return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
282      }
283    }
284  };
285  Ok(hex_char)
286}
287
288fn is_hex(c: Option<char>) -> bool {
289  let Some(c) = c else {
290    return false;
291  };
292  is_digit(c) || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
293}
294
/// Returns `true` when `c` is an ASCII decimal digit (`0`-`9`).
fn is_digit(c: char) -> bool {
  char::is_ascii_digit(&c)
}