nash_parse/string.rs
1//! String literal parsing for Nash.
2//!
3//! Ported from Elm's `Parse/String.hs`.
4
5use crate::error::{Escape, StringError};
6use crate::{Col, Parser, Row};
7
8/// Internal result type for string parsing.
9enum StringResult<'a> {
10 Ok(&'a str),
11 Err(StringError, Row, Col),
12}
13
14impl<'a> Parser<'a> {
15 /// Parse a string literal with custom error constructors.
16 ///
17 /// Mirrors Elm's `String.string`:
18 /// ```haskell
19 /// string :: (Row -> Col -> x) -> (E.String -> Row -> Col -> x) -> Parser x ES.String
20 /// ```
21 ///
22 /// Handles both single-line (`"..."`) and multi-line (`"""..."""`) strings.
23 pub fn string_literal<E>(
24 &mut self,
25 to_expectation: impl FnOnce(Row, Col) -> E,
26 to_error: impl FnOnce(StringError, Row, Col) -> E,
27 ) -> Result<&'a str, E> {
28 let (row, col) = self.position();
29
30 // Must start with double quote
31 if self.peek() != Some(b'"') {
32 return Err(to_expectation(row, col));
33 }
34
35 self.advance(); // consume first "
36
37 // Check for multi-line string (""")
38 if self.peek() == Some(b'"') {
39 self.advance(); // consume second "
40
41 if self.peek() == Some(b'"') {
42 self.advance(); // consume third "
43 // Multi-line string
44 let result = self.chomp_multi_string();
45 match result {
46 StringResult::Ok(s) => Ok(s),
47 StringResult::Err(e, r, c) => Err(to_error(e, r, c)),
48 }
49 } else {
50 // Empty string ""
51 Ok(self.alloc_str(""))
52 }
53 } else {
54 // Single-line string
55 let result = self.chomp_single_string();
56 match result {
57 StringResult::Ok(s) => Ok(s),
58 StringResult::Err(e, r, c) => Err(to_error(e, r, c)),
59 }
60 }
61 }
62
63 /// Parse a single-line string (content after opening `"`).
64 fn chomp_single_string(&mut self) -> StringResult<'a> {
65 let start_pos = self.pos;
66 let (start_row, start_col) = self.position();
67 let mut needs_escape = false;
68
69 loop {
70 match self.peek() {
71 None => {
72 // End of file without closing quote
73 return StringResult::Err(StringError::EndlessSingle, start_row, start_col);
74 }
75 Some(b'\n') => {
76 // Newline in single-line string
77 return StringResult::Err(StringError::EndlessSingle, self.row(), self.col());
78 }
79 Some(b'"') => {
80 // End of string
81 let end_pos = self.pos;
82 self.advance(); // consume closing "
83
84 if needs_escape {
85 // Build escaped string
86 return self.build_escaped_string(start_pos, end_pos, false);
87 } else {
88 // Return slice directly
89 let bytes = &self.src[start_pos..end_pos];
90 // SAFETY: We've verified this is valid UTF-8 by scanning byte-by-byte
91 let s = unsafe { std::str::from_utf8_unchecked(bytes) };
92 return StringResult::Ok(s);
93 }
94 }
95 Some(b'\\') => {
96 needs_escape = true;
97 self.advance(); // consume backslash
98
99 match self.eat_escape() {
100 EscapeResult::Normal(width) => {
101 self.advance_by(width);
102 }
103 EscapeResult::Unicode(delta) => {
104 self.advance_by(delta);
105 }
106 EscapeResult::Problem(escape) => {
107 return StringResult::Err(
108 StringError::Escape(escape),
109 self.row(),
110 self.col(),
111 );
112 }
113 EscapeResult::EndOfFile => {
114 return StringResult::Err(
115 StringError::EndlessSingle,
116 start_row,
117 start_col,
118 );
119 }
120 }
121 }
122 Some(b) => {
123 // Regular character - advance by UTF-8 width
124 let width = utf8_char_width(b);
125 self.advance_by(width);
126 }
127 }
128 }
129 }
130
131 /// Parse a multi-line string (content after opening `"""`).
132 fn chomp_multi_string(&mut self) -> StringResult<'a> {
133 let start_pos = self.pos;
134 let (start_row, start_col) = self.position();
135 let mut needs_escape = false;
136
137 loop {
138 match self.peek() {
139 None => {
140 return StringResult::Err(StringError::EndlessMulti, start_row, start_col);
141 }
142 Some(b'"') => {
143 // Check for closing """
144 if self.peek_at(1) == Some(b'"') && self.peek_at(2) == Some(b'"') {
145 let end_pos = self.pos;
146 self.advance_by(3); // consume closing """
147
148 if needs_escape {
149 return self.build_escaped_string(start_pos, end_pos, true);
150 } else {
151 let bytes = &self.src[start_pos..end_pos];
152 let s = unsafe { std::str::from_utf8_unchecked(bytes) };
153 return StringResult::Ok(s);
154 }
155 } else {
156 self.advance();
157 }
158 }
159 Some(b'\n') => {
160 // Newlines are allowed in multi-line strings
161 needs_escape = true; // We'll normalize to \n
162 self.advance();
163 }
164 Some(b'\r') => {
165 // Carriage return - skip it (normalize to just \n)
166 needs_escape = true;
167 self.advance();
168 }
169 Some(b'\\') => {
170 needs_escape = true;
171 self.advance();
172
173 match self.eat_escape() {
174 EscapeResult::Normal(width) => {
175 self.advance_by(width);
176 }
177 EscapeResult::Unicode(delta) => {
178 self.advance_by(delta);
179 }
180 EscapeResult::Problem(escape) => {
181 return StringResult::Err(
182 StringError::Escape(escape),
183 self.row(),
184 self.col(),
185 );
186 }
187 EscapeResult::EndOfFile => {
188 return StringResult::Err(
189 StringError::EndlessMulti,
190 start_row,
191 start_col,
192 );
193 }
194 }
195 }
196 Some(b) => {
197 let width = utf8_char_width(b);
198 self.advance_by(width);
199 }
200 }
201 }
202 }
203
204 /// Process escape sequences and build the final string.
205 fn build_escaped_string(&self, start: usize, end: usize, is_multi: bool) -> StringResult<'a> {
206 let mut result = String::new();
207 let mut pos = start;
208
209 while pos < end {
210 let b = self.src[pos];
211
212 if b == b'\\' {
213 pos += 1;
214 if pos >= end {
215 break;
216 }
217
218 match self.src[pos] {
219 b'n' => {
220 result.push('\n');
221 pos += 1;
222 }
223 b'r' => {
224 result.push('\r');
225 pos += 1;
226 }
227 b't' => {
228 result.push('\t');
229 pos += 1;
230 }
231 b'"' => {
232 result.push('"');
233 pos += 1;
234 }
235 b'\'' => {
236 result.push('\'');
237 pos += 1;
238 }
239 b'\\' => {
240 result.push('\\');
241 pos += 1;
242 }
243 b'u' => {
244 pos += 1; // skip 'u'
245 if pos < end && self.src[pos] == b'{' {
246 pos += 1; // skip '{'
247 let hex_start = pos;
248 while pos < end && self.src[pos] != b'}' {
249 pos += 1;
250 }
251 let hex_str =
252 unsafe { std::str::from_utf8_unchecked(&self.src[hex_start..pos]) };
253 if let Ok(code) = u32::from_str_radix(hex_str, 16)
254 && let Some(c) = char::from_u32(code)
255 {
256 result.push(c);
257 }
258 pos += 1; // skip '}'
259 }
260 }
261 _ => {
262 pos += 1;
263 }
264 }
265 } else if is_multi && b == b'\r' {
266 // Skip carriage returns in multi-line strings
267 pos += 1;
268 } else if is_multi && b == b'\n' {
269 result.push('\n');
270 pos += 1;
271 } else {
272 // Regular UTF-8 character
273 let width = utf8_char_width(b);
274 let char_bytes = &self.src[pos..pos + width];
275 let s = unsafe { std::str::from_utf8_unchecked(char_bytes) };
276 result.push_str(s);
277 pos += width;
278 }
279 }
280
281 StringResult::Ok(self.alloc_str(&result))
282 }
283
284 /// Parse an escape sequence after the backslash.
285 fn eat_escape(&self) -> EscapeResult {
286 match self.peek() {
287 None => EscapeResult::EndOfFile,
288 Some(b'n') | Some(b'r') | Some(b't') | Some(b'"') | Some(b'\'') | Some(b'\\') => {
289 EscapeResult::Normal(1)
290 }
291 Some(b'u') => self.eat_unicode(),
292 Some(_) => EscapeResult::Problem(Escape::Unknown),
293 }
294 }
295
296 /// Parse a unicode escape sequence `\u{...}`.
297 fn eat_unicode(&self) -> EscapeResult {
298 // Position is at 'u', need to check for '{'
299 if self.peek_at(1) != Some(b'{') {
300 return EscapeResult::Problem(Escape::BadUnicodeFormat(2));
301 }
302
303 // Count hex digits
304 let mut offset = 2; // past 'u{'
305 let mut num_digits = 0;
306 let mut code: u32 = 0;
307
308 loop {
309 match self.peek_at(offset) {
310 None => {
311 return EscapeResult::Problem(Escape::BadUnicodeFormat(offset as u16));
312 }
313 Some(b'}') => {
314 break;
315 }
316 Some(b) if b.is_ascii_hexdigit() => {
317 let digit = if b.is_ascii_digit() {
318 (b - b'0') as u32
319 } else if (b'a'..=b'f').contains(&b) {
320 (b - b'a' + 10) as u32
321 } else {
322 (b - b'A' + 10) as u32
323 };
324 code = code * 16 + digit;
325 num_digits += 1;
326 offset += 1;
327 }
328 Some(_) => {
329 return EscapeResult::Problem(Escape::BadUnicodeFormat(offset as u16));
330 }
331 }
332 }
333
334 // Check code validity
335 if code > 0x10FFFF {
336 return EscapeResult::Problem(Escape::BadUnicodeCode((offset + 1) as u16));
337 }
338
339 // Check digit count (must be 4-6)
340 if !(4..=6).contains(&num_digits) {
341 return EscapeResult::Problem(Escape::BadUnicodeLength {
342 code: (offset + 1) as u16,
343 expected: if num_digits < 4 { 4 } else { 6 },
344 actual: num_digits,
345 });
346 }
347
348 // Return total length including 'u', '{', digits, '}'
349 EscapeResult::Unicode(offset + 1)
350 }
351}
352
353/// Result of parsing an escape sequence.
354enum EscapeResult {
355 /// Normal escape like \n, width is 1
356 Normal(usize),
357 /// Unicode escape \u{...}, delta is total chars consumed
358 Unicode(usize),
359 /// End of file during escape
360 EndOfFile,
361 /// Invalid escape
362 Problem(Escape),
363}
364
/// Encoded length, in bytes, of the UTF-8 character whose first byte is `b`.
///
/// Only the lead byte is classified; nothing is validated. Bytes that cannot
/// start a character still get an answer: `0x80..=0xBF` (continuation bytes)
/// report 2, and everything from `0xF0` upwards reports 4.
#[inline]
fn utf8_char_width(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}