1use std::borrow::Cow;
2
3pub struct ParseStringError {
4 pub byte_index: usize,
5 pub kind: ParseStringErrorKind,
6}
7
/// The reason a string literal failed to parse.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum ParseStringErrorKind {
    /// An escaped double quote (`\"`) appeared inside a single-quoted string.
    InvalidEscapeInSingleQuoteString,
    /// An escaped single quote (`\'`) appeared inside a double-quoted string.
    InvalidEscapeInDoubleQuoteString,
    /// A `\u` escape was not followed by four hexadecimal digits.
    ExpectedFourHexDigits,
    /// A `\u` escape did not decode to a valid character.
    ///
    /// The payload is the offending hex text, optionally followed by a
    /// parenthesised explanation (for example an unpaired surrogate).
    InvalidUnicodeEscapeSequence(String),
    /// A backslash was followed by a character that is not a supported escape.
    InvalidEscape,
    /// The input ended before the closing quote was found.
    UnterminatedStringLiteral,
}
17
18impl std::error::Error for ParseStringErrorKind {}
19
20impl std::fmt::Display for ParseStringErrorKind {
21 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
22 match self {
23 ParseStringErrorKind::InvalidEscapeInSingleQuoteString => {
24 write!(f, "Invalid escape in single quote string")
25 }
26 ParseStringErrorKind::InvalidEscapeInDoubleQuoteString => {
27 write!(f, "Invalid escape in double quote string")
28 }
29 ParseStringErrorKind::ExpectedFourHexDigits => {
30 write!(f, "Expected four hex digits")
31 }
32 ParseStringErrorKind::InvalidUnicodeEscapeSequence(value) => {
33 write!(
34 f,
35 "Invalid unicode escape sequence. '{}' is not a valid UTF8 character",
36 value
37 )
38 }
39 ParseStringErrorKind::InvalidEscape => {
40 write!(f, "Invalid escape")
41 }
42 ParseStringErrorKind::UnterminatedStringLiteral => {
43 write!(f, "Unterminated string literal")
44 }
45 }
46 }
47}
48
/// A cursor over source text that hands out one character at a time.
///
/// `'a` is the lifetime of the underlying text, which lets a parser return
/// slices borrowed straight from it.
///
/// The string parser in this file relies on `byte_index` being the byte
/// offset, within `text()`, of the character reported by `current_char`.
pub trait CharProvider<'a> {
    /// Returns the full text the provider is reading from.
    fn text(&self) -> &'a str;
    /// Returns the byte offset of the current position within `text()`.
    fn byte_index(&self) -> usize;
    /// Returns the character at the current position without advancing, or
    /// `None` when there is no current character.
    fn current_char(&mut self) -> Option<char>;
    /// Advances to the next character and returns it, or `None` once the
    /// input is exhausted.
    fn move_next_char(&mut self) -> Option<char>;
}
55
/// Parses a quoted string literal held in `text`, resolving escape sequences.
///
/// `text` is expected to start with the opening quote (`"` or `'`). The
/// result borrows from `text` when no escapes had to be rewritten and is an
/// owned `String` otherwise.
///
/// # Errors
///
/// Returns whatever [`parse_string_with_char_provider`] reports: an invalid
/// escape, a malformed `\u` sequence, or a missing closing quote.
#[cfg(feature = "cst")]
pub fn parse_string(text: &str) -> Result<Cow<'_, str>, ParseStringError> {
    /// Minimal `CharProvider` that walks a `&str` and tracks the byte offset
    /// of the character it is currently on.
    struct StrCursor<'a> {
        source: &'a str,
        offset: usize,
        current: Option<char>,
        remaining: std::str::Chars<'a>,
    }

    impl<'a> CharProvider<'a> for StrCursor<'a> {
        fn text(&self) -> &'a str {
            self.source
        }

        fn byte_index(&self) -> usize {
            self.offset
        }

        fn current_char(&mut self) -> Option<char> {
            self.current
        }

        fn move_next_char(&mut self) -> Option<char> {
            // Step over the character we are leaving; once the input is
            // exhausted there is nothing left to step over.
            self.offset += self.current.map_or(0, char::len_utf8);
            self.current = self.remaining.next();
            self.current
        }
    }

    let mut remaining = text.chars();
    let current = remaining.next();
    let mut cursor = StrCursor {
        source: text,
        offset: 0,
        current,
        remaining,
    };

    parse_string_with_char_provider(&mut cursor)
}
97
98pub fn parse_string_with_char_provider<'a, T: CharProvider<'a>>(
99 chars: &mut T,
100) -> Result<Cow<'a, str>, ParseStringError> {
101 debug_assert!(
102 chars.current_char() == Some('\'') || chars.current_char() == Some('"'),
103 "Expected \", was {:?}",
104 chars.current_char()
105 );
106 let is_double_quote = chars.current_char() == Some('"');
107 let mut last_start_byte_index = chars.byte_index() + 1;
108 let mut text: Option<String> = None;
109 let mut last_was_backslash = false;
110 let mut found_end_string = false;
111 let token_start = chars.byte_index();
112
113 while let Some(current_char) = chars.move_next_char() {
114 if last_was_backslash {
115 let escape_start = chars.byte_index() - 1; match current_char {
117 '"' | '\'' | '\\' | '/' | 'b' | 'f' | 'u' | 'r' | 'n' | 't' => {
118 if current_char == '"' {
119 if !is_double_quote {
120 return Err(ParseStringError {
121 byte_index: escape_start,
122 kind: ParseStringErrorKind::InvalidEscapeInSingleQuoteString,
123 });
124 }
125 } else if current_char == '\'' && is_double_quote {
126 return Err(ParseStringError {
127 byte_index: escape_start,
128 kind: ParseStringErrorKind::InvalidEscapeInDoubleQuoteString,
129 });
130 }
131
132 let previous_text = &chars.text()[last_start_byte_index..escape_start];
133 if text.is_none() {
134 text = Some(String::new());
135 }
136 let text = text.as_mut().unwrap();
137 text.push_str(previous_text);
138 if current_char == 'u' {
139 let hex_char = parse_hex_char(chars).map_err(|kind| ParseStringError {
140 byte_index: escape_start,
141 kind,
142 })?;
143 text.push(hex_char);
144 last_start_byte_index = chars.byte_index() + chars.current_char().map(|c| c.len_utf8()).unwrap_or(0);
145 } else {
146 text.push(match current_char {
147 'b' => '\u{08}',
148 'f' => '\u{0C}',
149 't' => '\t',
150 'r' => '\r',
151 'n' => '\n',
152 _ => current_char,
153 });
154 last_start_byte_index = chars.byte_index() + current_char.len_utf8();
155 }
156 }
157 _ => {
158 return Err(ParseStringError {
159 byte_index: escape_start,
160 kind: ParseStringErrorKind::InvalidEscape,
161 });
162 }
163 }
164 last_was_backslash = false;
165 } else if is_double_quote && current_char == '"' || !is_double_quote && current_char == '\'' {
166 found_end_string = true;
167 break;
168 } else {
169 last_was_backslash = current_char == '\\';
170 }
171 }
172
173 if found_end_string {
174 chars.move_next_char();
175 let final_segment = &chars.text()[last_start_byte_index..chars.byte_index() - 1];
176 Ok(match text {
177 Some(mut text) => {
178 text.push_str(final_segment);
179 Cow::Owned(text)
180 }
181 None => Cow::Borrowed(final_segment),
182 })
183 } else {
184 Err(ParseStringError {
185 byte_index: token_start,
186 kind: ParseStringErrorKind::UnterminatedStringLiteral,
187 })
188 }
189}
190
191fn parse_hex_char<'a, T: CharProvider<'a>>(chars: &mut T) -> Result<char, ParseStringErrorKind> {
192 let mut hex_text = String::new();
193 for _ in 0..4 {
195 let current_char = chars.move_next_char();
196 if !is_hex(current_char) {
197 return Err(ParseStringErrorKind::ExpectedFourHexDigits);
198 }
199 if let Some(current_char) = current_char {
200 hex_text.push(current_char);
201 }
202 }
203
204 let hex_value = match u32::from_str_radix(&hex_text, 16) {
205 Ok(v) => v,
206 Err(_) => {
207 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
208 }
209 };
210
211 let hex_char = if (0xD800..=0xDBFF).contains(&hex_value) {
213 let next_char = chars.move_next_char();
216 if next_char != Some('\\') {
217 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
218 "{} (unpaired high surrogate)",
219 hex_text
220 )));
221 }
222
223 let next_char = chars.move_next_char();
224 if next_char != Some('u') {
225 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
226 "{} (unpaired high surrogate)",
227 hex_text
228 )));
229 }
230
231 let mut hex_text2 = String::new();
233 for _ in 0..4 {
234 let current_char = chars.move_next_char();
235 if !is_hex(current_char) {
236 return Err(ParseStringErrorKind::ExpectedFourHexDigits);
237 }
238 if let Some(current_char) = current_char {
239 hex_text2.push(current_char);
240 }
241 }
242
243 let hex_value2 = match u32::from_str_radix(&hex_text2, 16) {
244 Ok(v) => v,
245 Err(_) => {
246 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text2));
247 }
248 };
249
250 if !(0xDC00..=0xDFFF).contains(&hex_value2) {
252 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
253 "{} (high surrogate not followed by low surrogate)",
254 hex_text
255 )));
256 }
257
258 let code_point = ((hex_value - 0xD800) * 0x400) + (hex_value2 - 0xDC00) + 0x10000;
260
261 match std::char::from_u32(code_point) {
262 Some(c) => c,
263 None => {
264 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
265 "{}\\u{} (invalid surrogate pair)",
266 hex_text, hex_text2
267 )));
268 }
269 }
270 } else if (0xDC00..=0xDFFF).contains(&hex_value) {
271 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(format!(
273 "{} (unpaired low surrogate)",
274 hex_text
275 )));
276 } else {
277 match std::char::from_u32(hex_value) {
279 Some(hex_char) => hex_char,
280 None => {
281 return Err(ParseStringErrorKind::InvalidUnicodeEscapeSequence(hex_text));
282 }
283 }
284 };
285 Ok(hex_char)
286}
287
288fn is_hex(c: Option<char>) -> bool {
289 let Some(c) = c else {
290 return false;
291 };
292 is_digit(c) || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
293}
294
/// Returns `true` when `c` is one of the ASCII decimal digits `0` through `9`.
fn is_digit(c: char) -> bool {
    matches!(c, '0'..='9')
}