Skip to main content

nash_parse/
string.rs

1//! String literal parsing for Nash.
2//!
3//! Ported from Elm's `Parse/String.hs`.
4
5use crate::error::{Escape, StringError};
6use crate::{Col, Parser, Row};
7
8/// Internal result type for string parsing.
9enum StringResult<'a> {
10    Ok(&'a str),
11    Err(StringError, Row, Col),
12}
13
14impl<'a> Parser<'a> {
15    /// Parse a string literal with custom error constructors.
16    ///
17    /// Mirrors Elm's `String.string`:
18    /// ```haskell
19    /// string :: (Row -> Col -> x) -> (E.String -> Row -> Col -> x) -> Parser x ES.String
20    /// ```
21    ///
22    /// Handles both single-line (`"..."`) and multi-line (`"""..."""`) strings.
23    pub fn string_literal<E>(
24        &mut self,
25        to_expectation: impl FnOnce(Row, Col) -> E,
26        to_error: impl FnOnce(StringError, Row, Col) -> E,
27    ) -> Result<&'a str, E> {
28        let (row, col) = self.position();
29
30        // Must start with double quote
31        if self.peek() != Some(b'"') {
32            return Err(to_expectation(row, col));
33        }
34
35        self.advance(); // consume first "
36
37        // Check for multi-line string (""")
38        if self.peek() == Some(b'"') {
39            self.advance(); // consume second "
40
41            if self.peek() == Some(b'"') {
42                self.advance(); // consume third "
43                // Multi-line string
44                let result = self.chomp_multi_string();
45                match result {
46                    StringResult::Ok(s) => Ok(s),
47                    StringResult::Err(e, r, c) => Err(to_error(e, r, c)),
48                }
49            } else {
50                // Empty string ""
51                Ok(self.alloc_str(""))
52            }
53        } else {
54            // Single-line string
55            let result = self.chomp_single_string();
56            match result {
57                StringResult::Ok(s) => Ok(s),
58                StringResult::Err(e, r, c) => Err(to_error(e, r, c)),
59            }
60        }
61    }
62
63    /// Parse a single-line string (content after opening `"`).
64    fn chomp_single_string(&mut self) -> StringResult<'a> {
65        let start_pos = self.pos;
66        let (start_row, start_col) = self.position();
67        let mut needs_escape = false;
68
69        loop {
70            match self.peek() {
71                None => {
72                    // End of file without closing quote
73                    return StringResult::Err(StringError::EndlessSingle, start_row, start_col);
74                }
75                Some(b'\n') => {
76                    // Newline in single-line string
77                    return StringResult::Err(StringError::EndlessSingle, self.row(), self.col());
78                }
79                Some(b'"') => {
80                    // End of string
81                    let end_pos = self.pos;
82                    self.advance(); // consume closing "
83
84                    if needs_escape {
85                        // Build escaped string
86                        return self.build_escaped_string(start_pos, end_pos, false);
87                    } else {
88                        // Return slice directly
89                        let bytes = &self.src[start_pos..end_pos];
90                        // SAFETY: We've verified this is valid UTF-8 by scanning byte-by-byte
91                        let s = unsafe { std::str::from_utf8_unchecked(bytes) };
92                        return StringResult::Ok(s);
93                    }
94                }
95                Some(b'\\') => {
96                    needs_escape = true;
97                    self.advance(); // consume backslash
98
99                    match self.eat_escape() {
100                        EscapeResult::Normal(width) => {
101                            self.advance_by(width);
102                        }
103                        EscapeResult::Unicode(delta) => {
104                            self.advance_by(delta);
105                        }
106                        EscapeResult::Problem(escape) => {
107                            return StringResult::Err(
108                                StringError::Escape(escape),
109                                self.row(),
110                                self.col(),
111                            );
112                        }
113                        EscapeResult::EndOfFile => {
114                            return StringResult::Err(
115                                StringError::EndlessSingle,
116                                start_row,
117                                start_col,
118                            );
119                        }
120                    }
121                }
122                Some(b) => {
123                    // Regular character - advance by UTF-8 width
124                    let width = utf8_char_width(b);
125                    self.advance_by(width);
126                }
127            }
128        }
129    }
130
131    /// Parse a multi-line string (content after opening `"""`).
132    fn chomp_multi_string(&mut self) -> StringResult<'a> {
133        let start_pos = self.pos;
134        let (start_row, start_col) = self.position();
135        let mut needs_escape = false;
136
137        loop {
138            match self.peek() {
139                None => {
140                    return StringResult::Err(StringError::EndlessMulti, start_row, start_col);
141                }
142                Some(b'"') => {
143                    // Check for closing """
144                    if self.peek_at(1) == Some(b'"') && self.peek_at(2) == Some(b'"') {
145                        let end_pos = self.pos;
146                        self.advance_by(3); // consume closing """
147
148                        if needs_escape {
149                            return self.build_escaped_string(start_pos, end_pos, true);
150                        } else {
151                            let bytes = &self.src[start_pos..end_pos];
152                            let s = unsafe { std::str::from_utf8_unchecked(bytes) };
153                            return StringResult::Ok(s);
154                        }
155                    } else {
156                        self.advance();
157                    }
158                }
159                Some(b'\n') => {
160                    // Newlines are allowed in multi-line strings
161                    needs_escape = true; // We'll normalize to \n
162                    self.advance();
163                }
164                Some(b'\r') => {
165                    // Carriage return - skip it (normalize to just \n)
166                    needs_escape = true;
167                    self.advance();
168                }
169                Some(b'\\') => {
170                    needs_escape = true;
171                    self.advance();
172
173                    match self.eat_escape() {
174                        EscapeResult::Normal(width) => {
175                            self.advance_by(width);
176                        }
177                        EscapeResult::Unicode(delta) => {
178                            self.advance_by(delta);
179                        }
180                        EscapeResult::Problem(escape) => {
181                            return StringResult::Err(
182                                StringError::Escape(escape),
183                                self.row(),
184                                self.col(),
185                            );
186                        }
187                        EscapeResult::EndOfFile => {
188                            return StringResult::Err(
189                                StringError::EndlessMulti,
190                                start_row,
191                                start_col,
192                            );
193                        }
194                    }
195                }
196                Some(b) => {
197                    let width = utf8_char_width(b);
198                    self.advance_by(width);
199                }
200            }
201        }
202    }
203
204    /// Process escape sequences and build the final string.
205    fn build_escaped_string(&self, start: usize, end: usize, is_multi: bool) -> StringResult<'a> {
206        let mut result = String::new();
207        let mut pos = start;
208
209        while pos < end {
210            let b = self.src[pos];
211
212            if b == b'\\' {
213                pos += 1;
214                if pos >= end {
215                    break;
216                }
217
218                match self.src[pos] {
219                    b'n' => {
220                        result.push('\n');
221                        pos += 1;
222                    }
223                    b'r' => {
224                        result.push('\r');
225                        pos += 1;
226                    }
227                    b't' => {
228                        result.push('\t');
229                        pos += 1;
230                    }
231                    b'"' => {
232                        result.push('"');
233                        pos += 1;
234                    }
235                    b'\'' => {
236                        result.push('\'');
237                        pos += 1;
238                    }
239                    b'\\' => {
240                        result.push('\\');
241                        pos += 1;
242                    }
243                    b'u' => {
244                        pos += 1; // skip 'u'
245                        if pos < end && self.src[pos] == b'{' {
246                            pos += 1; // skip '{'
247                            let hex_start = pos;
248                            while pos < end && self.src[pos] != b'}' {
249                                pos += 1;
250                            }
251                            let hex_str =
252                                unsafe { std::str::from_utf8_unchecked(&self.src[hex_start..pos]) };
253                            if let Ok(code) = u32::from_str_radix(hex_str, 16)
254                                && let Some(c) = char::from_u32(code)
255                            {
256                                result.push(c);
257                            }
258                            pos += 1; // skip '}'
259                        }
260                    }
261                    _ => {
262                        pos += 1;
263                    }
264                }
265            } else if is_multi && b == b'\r' {
266                // Skip carriage returns in multi-line strings
267                pos += 1;
268            } else if is_multi && b == b'\n' {
269                result.push('\n');
270                pos += 1;
271            } else {
272                // Regular UTF-8 character
273                let width = utf8_char_width(b);
274                let char_bytes = &self.src[pos..pos + width];
275                let s = unsafe { std::str::from_utf8_unchecked(char_bytes) };
276                result.push_str(s);
277                pos += width;
278            }
279        }
280
281        StringResult::Ok(self.alloc_str(&result))
282    }
283
284    /// Parse an escape sequence after the backslash.
285    fn eat_escape(&self) -> EscapeResult {
286        match self.peek() {
287            None => EscapeResult::EndOfFile,
288            Some(b'n') | Some(b'r') | Some(b't') | Some(b'"') | Some(b'\'') | Some(b'\\') => {
289                EscapeResult::Normal(1)
290            }
291            Some(b'u') => self.eat_unicode(),
292            Some(_) => EscapeResult::Problem(Escape::Unknown),
293        }
294    }
295
296    /// Parse a unicode escape sequence `\u{...}`.
297    fn eat_unicode(&self) -> EscapeResult {
298        // Position is at 'u', need to check for '{'
299        if self.peek_at(1) != Some(b'{') {
300            return EscapeResult::Problem(Escape::BadUnicodeFormat(2));
301        }
302
303        // Count hex digits
304        let mut offset = 2; // past 'u{'
305        let mut num_digits = 0;
306        let mut code: u32 = 0;
307
308        loop {
309            match self.peek_at(offset) {
310                None => {
311                    return EscapeResult::Problem(Escape::BadUnicodeFormat(offset as u16));
312                }
313                Some(b'}') => {
314                    break;
315                }
316                Some(b) if b.is_ascii_hexdigit() => {
317                    let digit = if b.is_ascii_digit() {
318                        (b - b'0') as u32
319                    } else if (b'a'..=b'f').contains(&b) {
320                        (b - b'a' + 10) as u32
321                    } else {
322                        (b - b'A' + 10) as u32
323                    };
324                    code = code * 16 + digit;
325                    num_digits += 1;
326                    offset += 1;
327                }
328                Some(_) => {
329                    return EscapeResult::Problem(Escape::BadUnicodeFormat(offset as u16));
330                }
331            }
332        }
333
334        // Check code validity
335        if code > 0x10FFFF {
336            return EscapeResult::Problem(Escape::BadUnicodeCode((offset + 1) as u16));
337        }
338
339        // Check digit count (must be 4-6)
340        if !(4..=6).contains(&num_digits) {
341            return EscapeResult::Problem(Escape::BadUnicodeLength {
342                code: (offset + 1) as u16,
343                expected: if num_digits < 4 { 4 } else { 6 },
344                actual: num_digits,
345            });
346        }
347
348        // Return total length including 'u', '{', digits, '}'
349        EscapeResult::Unicode(offset + 1)
350    }
351}
352
353/// Result of parsing an escape sequence.
354enum EscapeResult {
355    /// Normal escape like \n, width is 1
356    Normal(usize),
357    /// Unicode escape \u{...}, delta is total chars consumed
358    Unicode(usize),
359    /// End of file during escape
360    EndOfFile,
361    /// Invalid escape
362    Problem(Escape),
363}
364
/// Returns how many bytes a UTF-8 sequence occupies, judged from its first
/// byte alone.
///
/// The byte is not validated: anything in `0x80..=0xDF` (which includes
/// continuation bytes) reports 2, and anything from `0xF0` upwards reports 4.
#[inline]
fn utf8_char_width(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}