Skip to main content

hocon/
lexer.rs

1use crate::error::ParseError;
2
/// Classification of a lexed [`Token`].
///
/// The enum is fieldless, so it derives `Eq` in addition to `PartialEq`
/// (consistent with `Segment`), which lets callers use it in contexts that
/// require full equivalence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `=`
    Equals,
    /// `+=`
    PlusEquals,
    /// ASCII LF. The lexer never emits two `Newline` tokens back to back.
    Newline,
    /// A `"..."` string; the token value holds the decoded body.
    QuotedString,
    /// A `"""..."""` string; the token value holds the raw body.
    TripleQuotedString,
    /// A run of unquoted characters.
    Unquoted,
    /// A `${...}` or `${?...}` substitution; details live in `Token::subst`.
    Substitution,
    /// End of input; always the final token produced by `tokenize`.
    Eof,
}
20
/// One dot-separated element of a substitution path, together with the
/// source position at which that element began.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// Decoded text of the path element (quoted fragments already unescaped).
    pub text: String,
    /// Line on which the element started.
    pub line: usize,
    /// Column at which the element started.
    pub col: usize,
}
27
28/// Payload carried by a `${...}` or `${?...}` substitution token.
29///
30/// `#[non_exhaustive]` ensures that adding new fields here (e.g. future spec
31/// extensions) does not break downstream crates that pattern-match or
32/// construct this struct.
33#[non_exhaustive]
34#[derive(Debug, Clone)]
35pub struct SubstPayload {
36    pub segments: Vec<Segment>,
37    pub optional: bool,
38    /// True when the substitution body carries a `[]` suffix, signalling
39    /// env-var-list expansion (`${X[]}` / `${?X[]}`).
40    pub list_suffix: bool,
41}
42
43/// A single token produced by the lexer.
44///
45/// `Token` is publicly re-exported as `hocon::Token` for the narrow surface
46/// that integration tests and diagnostic tooling need (per the advisory in
47/// `lib.rs`). It is marked `#[non_exhaustive]`: downstream code MUST NOT
48/// construct `Token` via struct-literal syntax and should treat it as
49/// inspect-only. This frees the lexer to add new metadata fields (e.g.
50/// `preceding_whitespace` in v1.5.3) without further source breaks.
51#[derive(Debug, Clone)]
52#[non_exhaustive]
53pub struct Token {
54    pub kind: TokenKind,
55    pub value: String,
56    pub line: usize,
57    pub col: usize,
58    #[allow(dead_code)]
59    pub is_quoted: bool,
60    /// True if preceded by whitespace OR a comment (concat detection, S10.5 / S10.8).
61    pub preceding_space: bool,
62    /// Literal preceding-whitespace chars consumed since the previous token.
63    /// Used by `parse_key` to preserve path-expression whitespace per E13 — for
64    /// `a b. c = 1` the ' ' before `c` becomes a leading-space prefix on the
65    /// post-dot segment.
66    ///
67    /// Note: `preceding_space` may be true while `preceding_whitespace` is empty
68    /// when the token is preceded only by a comment (no literal WS chars). The
69    /// boolean is the right signal for concat detection; the string is the right
70    /// signal for path-WS preservation. The comment-only shape fires for the
71    /// `newline` token emitted after `// foo\n` / `# foo\n`; non-newline tokens
72    /// participating in concat / path-WS contexts are always either preceded by
73    /// literal WS chars OR follow a newline that resets the buffer.
74    pub preceding_whitespace: String,
75    pub subst: Option<SubstPayload>,
76}
77
/// Reports whether `ch` belongs to the HOCON whitespace set.
///
/// Lightbend HOCON.md §Whitespace (L165-184) defines the set as Java's
/// `Character.isWhitespace` characters, plus the non-breaking spaces Java
/// leaves out (U+00A0, U+2007, U+202F), plus the BOM (U+FEFF):
///
/// | group   | code points                                                  |
/// |---------|--------------------------------------------------------------|
/// | control | U+0009..=U+000D (TAB, LF, VT, FF, CR), U+001C..=U+001F       |
/// | Zs      | U+0020, U+00A0, U+1680, U+2000..=U+200A, U+202F, U+205F, U+3000 |
/// | Zl / Zp | U+2028, U+2029                                               |
/// | BOM     | U+FEFF                                                       |
///
/// LF (U+000A) is part of this set, so a caller that needs to treat newlines
/// differently from inter-token whitespace must test `is_hocon_newline`
/// before calling this function.
pub(crate) fn is_hocon_whitespace(ch: char) -> bool {
    matches!(
        ch,
        '\u{0009}'..='\u{000D}'
            | '\u{001C}'..='\u{001F}'
            | '\u{0020}'
            | '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{2028}'..='\u{2029}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
            | '\u{FEFF}'
    )
}
107
/// Reports whether `ch` is the one character HOCON treats as a newline:
/// ASCII LF (U+000A).
///
/// HOCON.md L182-184 states that "newline refers only and specifically to
/// ASCII newline 0x000A", so the Unicode line and paragraph separators
/// (U+2028, U+2029) count as whitespace but NOT as newlines.
fn is_hocon_newline(ch: char) -> bool {
    matches!(ch, '\u{000A}')
}
116
117pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
118    let chars: Vec<char> = input.chars().collect();
119    let mut tokens = Vec::new();
120    let mut pos = 0usize;
121    let mut line = 1usize;
122    let mut col = 1usize;
123    let mut had_space = false;
124    // E13 — accumulates literal whitespace chars consumed between tokens.
125    // Drained (via std::mem::take) on every token push. Comment text is NOT
126    // accumulated; only the actual WS chars.
127    let mut whitespace_buffer = String::new();
128
129    // Strip UTF-8 BOM
130    if !chars.is_empty() && chars[0] == '\u{FEFF}' {
131        pos = 1;
132    }
133
134    let peek =
135        |pos: usize, offset: usize| -> char { chars.get(pos + offset).copied().unwrap_or('\0') };
136
137    while pos < chars.len() {
138        let sl = line;
139        let sc = col;
140        let ch = chars[pos];
141
142        // Newline (must be checked before general whitespace because
143        // is_hocon_whitespace also returns true for LF — see spec §D).
144        if is_hocon_newline(ch) {
145            pos += 1;
146            line += 1;
147            col = 1;
148            if tokens
149                .last()
150                .is_none_or(|t: &Token| t.kind != TokenKind::Newline)
151            {
152                tokens.push(Token {
153                    kind: TokenKind::Newline,
154                    value: "\n".into(),
155                    line: sl,
156                    col: sc,
157                    is_quoted: false,
158                    preceding_space: had_space,
159                    preceding_whitespace: std::mem::take(&mut whitespace_buffer),
160                    subst: None,
161                });
162                had_space = false;
163            }
164            continue;
165        }
166
167        // Whitespace (not newline) — full HOCON_WS set per spec L165-184.
168        if is_hocon_whitespace(ch) {
169            whitespace_buffer.push(ch);
170            pos += 1;
171            col += 1;
172            had_space = true;
173            continue;
174        }
175
176        // Comments
177        if ch == '/' && peek(pos, 1) == '/' {
178            while pos < chars.len() && chars[pos] != '\n' {
179                pos += 1;
180                col += 1;
181            }
182            had_space = true;
183            continue;
184        }
185        if ch == '#' {
186            while pos < chars.len() && chars[pos] != '\n' {
187                pos += 1;
188                col += 1;
189            }
190            had_space = true;
191            continue;
192        }
193
194        // Single-char punctuation
195        let single_kind = match ch {
196            '{' => Some(TokenKind::LBrace),
197            '}' => Some(TokenKind::RBrace),
198            '[' => Some(TokenKind::LBracket),
199            ']' => Some(TokenKind::RBracket),
200            ',' => Some(TokenKind::Comma),
201            ':' => Some(TokenKind::Colon),
202            _ => None,
203        };
204        if let Some(kind) = single_kind {
205            pos += 1;
206            col += 1;
207            tokens.push(Token {
208                kind,
209                value: ch.to_string(),
210                line: sl,
211                col: sc,
212                is_quoted: false,
213                preceding_space: had_space,
214                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
215                subst: None,
216            });
217            had_space = false;
218            continue;
219        }
220
221        // = and +=
222        if ch == '=' {
223            pos += 1;
224            col += 1;
225            tokens.push(Token {
226                kind: TokenKind::Equals,
227                value: "=".into(),
228                line: sl,
229                col: sc,
230                is_quoted: false,
231                preceding_space: had_space,
232                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
233                subst: None,
234            });
235            had_space = false;
236            continue;
237        }
238        if ch == '+' && peek(pos, 1) == '=' {
239            pos += 2;
240            col += 2;
241            tokens.push(Token {
242                kind: TokenKind::PlusEquals,
243                value: "+=".into(),
244                line: sl,
245                col: sc,
246                is_quoted: false,
247                preceding_space: had_space,
248                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
249                subst: None,
250            });
251            had_space = false;
252            continue;
253        }
254
255        // Substitution ${...} or ${?...}
256        if ch == '$' && peek(pos, 1) == '{' {
257            pos += 2;
258            col += 2;
259            let payload = parse_subst_body(&chars, &mut pos, &mut col, sl, sc)?;
260            // Reconstruct a canonical value string from segments.
261            // Segments that need quoting (contain dot, space, empty, etc.) are wrapped in "...".
262            let value = payload
263                .segments
264                .iter()
265                .map(|s| {
266                    let t = &s.text;
267                    if t.is_empty()
268                        || t.contains('.')
269                        || t.contains(' ')
270                        || t.contains('\t')
271                        || t.contains('"')
272                        || t.contains('\\')
273                        || t != t.trim()
274                    {
275                        let escaped = t.replace('\\', "\\\\").replace('"', "\\\"");
276                        format!("\"{}\"", escaped)
277                    } else {
278                        t.clone()
279                    }
280                })
281                .collect::<Vec<_>>()
282                .join(".");
283            tokens.push(Token {
284                kind: TokenKind::Substitution,
285                value,
286                line: sl,
287                col: sc,
288                is_quoted: false,
289                preceding_space: had_space,
290                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
291                subst: Some(payload),
292            });
293            had_space = false;
294            continue;
295        }
296
297        // Triple-quoted string
298        if ch == '"' && peek(pos, 1) == '"' && peek(pos, 2) == '"' {
299            pos += 3;
300            col += 3;
301            let mut value = String::new();
302            let mut found_closing = false;
303            loop {
304                if pos >= chars.len() {
305                    break;
306                }
307                if chars[pos] == '"' {
308                    let mut quote_count = 0;
309                    while pos < chars.len() && chars[pos] == '"' {
310                        quote_count += 1;
311                        pos += 1;
312                        col += 1;
313                    }
314                    if quote_count >= 3 {
315                        for _ in 0..(quote_count - 3) {
316                            value.push('"');
317                        }
318                        found_closing = true;
319                        break;
320                    }
321                    for _ in 0..quote_count {
322                        value.push('"');
323                    }
324                    continue;
325                }
326                if chars[pos] == '\n' {
327                    line += 1;
328                    col = 1;
329                } else {
330                    col += 1;
331                }
332                value.push(chars[pos]);
333                pos += 1;
334            }
335            if !found_closing {
336                return Err(ParseError {
337                    message: "unterminated triple-quoted string".into(),
338                    line: sl,
339                    col: sc,
340                });
341            }
342            if value.starts_with('\n') {
343                value = value[1..].to_string();
344            }
345            tokens.push(Token {
346                kind: TokenKind::TripleQuotedString,
347                value,
348                line: sl,
349                col: sc,
350                is_quoted: true,
351                preceding_space: had_space,
352                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
353                subst: None,
354            });
355            had_space = false;
356            continue;
357        }
358
359        // Quoted string
360        if ch == '"' {
361            pos += 1;
362            col += 1;
363            let value = read_quoted_body(&chars, &mut pos, &mut col, sl, sc)?;
364            tokens.push(Token {
365                kind: TokenKind::QuotedString,
366                value,
367                line: sl,
368                col: sc,
369                is_quoted: true,
370                preceding_space: had_space,
371                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
372                subst: None,
373            });
374            had_space = false;
375            continue;
376        }
377
378        // Unquoted string
379        if is_unquoted_start(ch) {
380            // S8.6 / E8 (xx.hocon#31, xx.hocon#32 / commit dd102e8): the
381            // value-position read of HOCON.md L270-276 admits `-` even when
382            // not followed by a digit (bare `-` and `-foo` are unquoted
383            // strings, matching Lightbend's reference) and admits digit-
384            // leading runs (greedy: parse as number first, fall back to
385            // unquoted string when the run isn't a valid number — rs.hocon
386            // has no separate Number token kind, so this is realized at the
387            // parser/coerce layer in parse_scalar_value). The strict reject
388            // at this site was removed by the E8 amendment; concat-
389            // continuation cases like `${a}-bar` rely on the absence of
390            // that reject to extend the unquoted run after a value-token.
391            // Path-element strict checks live elsewhere — see
392            // parse_subst_body (this file) and parse_key (parser.rs).
393            let mut value = String::new();
394            while pos < chars.len() && is_unquoted_continue(chars[pos], || peek(pos, 1)) {
395                value.push(chars[pos]);
396                pos += 1;
397                col += 1;
398            }
399            let trimmed = value.trim_end().to_string();
400            tokens.push(Token {
401                kind: TokenKind::Unquoted,
402                value: trimmed,
403                line: sl,
404                col: sc,
405                is_quoted: false,
406                preceding_space: had_space,
407                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
408                subst: None,
409            });
410            had_space = false;
411            continue;
412        }
413
414        return Err(ParseError {
415            message: format!("unexpected character: {:?}", ch),
416            line: sl,
417            col: sc,
418        });
419    }
420
421    tokens.push(Token {
422        kind: TokenKind::Eof,
423        value: String::new(),
424        line,
425        col,
426        is_quoted: false,
427        preceding_space: false,
428        preceding_whitespace: String::new(),
429        subst: None,
430    });
431    Ok(tokens)
432}
433
434/// Read the body of a quoted string (opening `"` already consumed).
435/// Returns the decoded string or a ParseError.
436/// `open_line`/`open_col` are the position of the opening `"` for error reporting.
437fn read_quoted_body(
438    chars: &[char],
439    pos: &mut usize,
440    col: &mut usize,
441    open_line: usize,
442    open_col: usize,
443) -> Result<String, ParseError> {
444    let mut value = String::new();
445    while *pos < chars.len() && chars[*pos] != '"' {
446        if chars[*pos] == '\n' {
447            return Err(ParseError {
448                message: "unterminated string".into(),
449                line: open_line,
450                col: open_col,
451            });
452        }
453        if chars[*pos] == '\\' {
454            let esc_col = *col;
455            *pos += 1;
456            *col += 1;
457            if *pos >= chars.len() {
458                return Err(ParseError {
459                    message: "unterminated string".into(),
460                    line: open_line,
461                    col: open_col,
462                });
463            }
464            let esc = chars[*pos];
465            *pos += 1;
466            *col += 1;
467            match esc {
468                'n' => value.push('\n'),
469                't' => value.push('\t'),
470                'r' => value.push('\r'),
471                '"' => value.push('"'),
472                '\\' => value.push('\\'),
473                '/' => value.push('/'),
474                'b' => value.push('\u{0008}'),
475                'f' => value.push('\u{000C}'),
476                'u' => {
477                    let hex: String = chars[*pos..].iter().take(4).collect();
478                    if hex.len() < 4 || !hex.chars().all(|c| c.is_ascii_hexdigit()) {
479                        return Err(ParseError {
480                            message: "invalid unicode escape".into(),
481                            line: open_line,
482                            col: esc_col,
483                        });
484                    }
485                    let code = u32::from_str_radix(&hex, 16).map_err(|_| ParseError {
486                        message: "invalid unicode escape".into(),
487                        line: open_line,
488                        col: esc_col,
489                    })?;
490                    let c = char::from_u32(code).ok_or_else(|| ParseError {
491                        message: "invalid unicode escape".into(),
492                        line: open_line,
493                        col: esc_col,
494                    })?;
495                    value.push(c);
496                    *pos += 4;
497                    *col += 4;
498                }
499                _ => {
500                    return Err(ParseError {
501                        message: "invalid escape sequence".into(),
502                        line: open_line,
503                        col: esc_col,
504                    });
505                }
506            }
507        } else {
508            value.push(chars[*pos]);
509            *pos += 1;
510            *col += 1;
511        }
512    }
513    if *pos >= chars.len() || chars[*pos] != '"' {
514        return Err(ParseError {
515            message: "unterminated string".into(),
516            line: open_line,
517            col: open_col,
518        });
519    }
520    *pos += 1;
521    *col += 1;
522    Ok(value)
523}
524
525/// Returns true if `ch` is a valid unquoted character inside a `${...}` body.
526/// Forbidden: any HOCON whitespace (full set per is_hocon_whitespace), `"`, `\`,
527///            `{`, `}`, `[`, `]`, `:`, `=`, `,`, `+`, `#`, `` ` ``, `^`, `?`,
528///            `!`, `@`, `*`, `&`, `$`, `.`.
529fn is_unquoted_subst_char(ch: char) -> bool {
530    if is_hocon_whitespace(ch) {
531        return false;
532    }
533    !matches!(
534        ch,
535        '"' | '\\'
536            | '{'
537            | '}'
538            | '['
539            | ']'
540            | ':'
541            | '='
542            | ','
543            | '+'
544            | '#'
545            | '`'
546            | '^'
547            | '?'
548            | '!'
549            | '@'
550            | '*'
551            | '&'
552            | '$'
553            | '.'
554    )
555}
556
557/// Consume the literal two-character sequence `[]` at the current position.
558///
559/// Called by `parse_subst_body` when the `[` arm fires. Expects `chars[*pos] == '['`
560/// on entry. Strict: no whitespace inside the brackets (`${X[ ]}` is a lex error).
561fn parse_literal_brackets(
562    chars: &[char],
563    pos: &mut usize,
564    col: &mut usize,
565    start_line: usize,
566) -> Result<(), ParseError> {
567    // Consume `[`.
568    debug_assert!(*pos < chars.len() && chars[*pos] == '[');
569    *pos += 1;
570    *col += 1;
571    // Next char must be `]` (no whitespace inside the brackets).
572    if *pos >= chars.len() || chars[*pos] != ']' {
573        let got = chars
574            .get(*pos)
575            .map(|c| c.escape_debug().to_string())
576            .unwrap_or_else(|| "EOF".into());
577        return Err(ParseError {
578            message: format!(
579                "expected ']' after '[' in substitution list suffix, got {}",
580                got
581            ),
582            line: start_line,
583            col: *col,
584        });
585    }
586    *pos += 1;
587    *col += 1;
588    Ok(())
589}
590
591/// Parse the body of a `${...}` substitution (called after `${` has been consumed).
592/// Returns the `SubstPayload` or a `ParseError`.
593fn parse_subst_body(
594    chars: &[char],
595    pos: &mut usize,
596    col: &mut usize,
597    start_line: usize,
598    start_col: usize,
599) -> Result<SubstPayload, ParseError> {
600    // Assumes `${` already consumed. Position is at char after `{`.
601
602    // START: check for optional sigil
603    let optional = if *pos < chars.len() && chars[*pos] == '?' {
604        *pos += 1;
605        *col += 1;
606        true
607    } else {
608        false
609    };
610
611    // COLLECT
612    // current segment state
613    let mut cur_text = String::new();
614    let mut cur_started = false;
615    let mut cur_line = 0usize;
616    let mut cur_col = 0usize;
617
618    let mut pending_ws = String::new();
619    let mut segments: Vec<Segment> = Vec::new();
620    // Track last-seen DOT position for trailing-dot error reporting.
621    let mut last_dot: Option<(usize, usize)> = None;
622    // Set to true when a `[]` suffix is encountered (S13c env-var-list).
623    let mut list_suffix = false;
624
625    loop {
626        if *pos >= chars.len() {
627            return Err(ParseError {
628                message: "unterminated substitution".into(),
629                line: start_line,
630                col: start_col,
631            });
632        }
633        let ch = chars[*pos];
634
635        match ch {
636            '}' => {
637                // END
638                *pos += 1;
639                *col += 1;
640                // Drop pending_ws (trailing whitespace)
641                pending_ws.clear();
642                break;
643            }
644            '"' => {
645                // QUOTED token
646                let q_line = start_line; // all on same conceptual line (no literal newlines allowed)
647                let q_col = *col;
648                if cur_started {
649                    cur_text.push_str(&pending_ws);
650                }
651                pending_ws.clear();
652                *pos += 1;
653                *col += 1;
654                let decoded = read_quoted_body(chars, pos, col, q_line, q_col)?;
655                cur_text.push_str(&decoded);
656                if !cur_started {
657                    cur_line = q_line;
658                    cur_col = q_col;
659                    cur_started = true;
660                }
661            }
662            ch if is_unquoted_subst_char(ch) => {
663                // S8.6 (HOCON.md L270–276) also applies to unquoted path
664                // segments inside ${...}: a segment beginning with '-' must be
665                // followed by a digit. Gate on `!cur_started` so the check
666                // fires only at **segment start** — a `-` that follows a
667                // quoted fragment in the same segment (e.g. `${"a"-foo}`
668                // resolving the key `"a-foo"` via quoted/unquoted concat) is
669                // not policed, mirroring how the existing `${"a"x}` flow
670                // builds `"ax"`. Digit-leading segments are not policed here
671                // either (consistent with the value-position rule and
672                // rs.hocon's unquoted-only token model — see
673                // docs/spec-compliance.md §S8.6).
674                if ch == '-' && !cur_started {
675                    let next = chars.get(*pos + 1).copied().unwrap_or('\0');
676                    if !next.is_ascii_digit() {
677                        let after = if next == '\0' {
678                            String::from("EOF")
679                        } else {
680                            format!("{:?}", next)
681                        };
682                        return Err(ParseError {
683                            message: format!(
684                                "unquoted path segment cannot begin with '-' unless followed by a digit (got '-' then {}, HOCON.md L270-276)",
685                                after
686                            ),
687                            line: start_line,
688                            col: *col,
689                        });
690                    }
691                }
692                // UNQUOTED token: read a run of unquoted chars
693                let uq_col = *col;
694                if cur_started {
695                    cur_text.push_str(&pending_ws);
696                }
697                pending_ws.clear();
698                if !cur_started {
699                    cur_line = start_line;
700                    cur_col = uq_col;
701                    cur_started = true;
702                }
703                while *pos < chars.len() && is_unquoted_subst_char(chars[*pos]) {
704                    cur_text.push(chars[*pos]);
705                    *pos += 1;
706                    *col += 1;
707                }
708            }
709            '.' => {
710                // DOT: flush current segment (or error if not started)
711                let dot_col = *col;
712                pending_ws.clear();
713                if !cur_started {
714                    return Err(ParseError {
715                        message: "empty segment in path".into(),
716                        line: start_line,
717                        col: dot_col,
718                    });
719                }
720                segments.push(Segment {
721                    text: std::mem::take(&mut cur_text),
722                    line: cur_line,
723                    col: cur_col,
724                });
725                cur_started = false;
726                cur_line = 0;
727                cur_col = 0;
728                last_dot = Some((start_line, dot_col));
729                *pos += 1;
730                *col += 1;
731            }
732            '[' => {
733                // S13c: `[]` suffix — end of path expression, start of list-suffix.
734                // Two convergent multi-impl checks (mirrors go.hocon + ts.hocon fixes):
735                //
736                //   (a) Empty-segment guard: error if no segment has been started AND
737                //       either there are no segments yet (`${[]}` / `${ []}`) or a
738                //       trailing dot was just consumed (`${X.[]}` / `${X . []}`).
739                //       Both reduce to `!cur_started` — uniform error.
740                //   (b) E7 narrow: pending_ws may contain only ASCII SPACE (0x20) or
741                //       TAB (0x09). Wider HOCON whitespace (NBSP, CR, Zs, BOM, …) is
742                //       accumulated by the broader inter-token WS arm below (S6 set)
743                //       but is rejected here for the `[` boundary per extra-spec E7
744                //       ("narrow allow-list intentionally avoids semantic surprise").
745                if !cur_started {
746                    return Err(ParseError {
747                        message: "empty segment before '[]' suffix in substitution".into(),
748                        line: start_line,
749                        col: *col,
750                    });
751                }
752                for w in pending_ws.chars() {
753                    if w != ' ' && w != '\t' {
754                        return Err(ParseError {
755                            message: format!(
756                                "only ASCII space or tab allowed between substitution path and '[]' suffix (got {:?}, HOCON extra-spec E7)",
757                                w
758                            ),
759                            line: start_line,
760                            col: *col,
761                        });
762                    }
763                }
764                // Flush in-progress unquoted segment (same as the `}` path).
765                segments.push(Segment {
766                    text: std::mem::take(&mut cur_text),
767                    line: cur_line,
768                    col: cur_col,
769                });
770                cur_started = false;
771                // E7-conformant pending_ws is intentionally discarded.
772                pending_ws.clear();
773                // Consume the literal `[]`.
774                parse_literal_brackets(chars, pos, col, start_line)?;
775                list_suffix = true;
776                // After `[]` the only legal next char is `}`.
777                if *pos >= chars.len() || chars[*pos] != '}' {
778                    return Err(ParseError {
779                        message: "expected '}' after '[]' in substitution".into(),
780                        line: start_line,
781                        col: *col,
782                    });
783                }
784                *pos += 1;
785                *col += 1;
786                break;
787            }
788            ch if is_hocon_whitespace(ch) && !is_hocon_newline(ch) => {
789                // Inter-token whitespace (full HOCON_WS minus LF): buffer into
790                // pending_ws; column advances but line is unchanged.
791                pending_ws.push(ch);
792                *pos += 1;
793                *col += 1;
794            }
795            '\n' => {
796                // LF inside ${...} is not allowed (unterminated substitution).
797                return Err(ParseError {
798                    message: "unterminated substitution".into(),
799                    line: start_line,
800                    col: start_col,
801                });
802            }
803            other => {
804                return Err(ParseError {
805                    message: format!(
806                        "unexpected character in substitution path: {}",
807                        other.escape_debug()
808                    ),
809                    line: start_line,
810                    col: *col,
811                });
812            }
813        }
814    }
815
816    // END validation (only reached via `}` break; `[]` break already pushes segment).
817    if cur_started {
818        segments.push(Segment {
819            text: cur_text,
820            line: cur_line,
821            col: cur_col,
822        });
823    } else if segments.is_empty() {
824        // ${}
825        return Err(ParseError {
826            message: "empty substitution path".into(),
827            line: start_line,
828            col: start_col,
829        });
830    } else if !list_suffix {
831        // trailing dot: ${foo.} — report at the offending dot position.
832        // Not an error when list_suffix=true; the `[]` arm already flushed.
833        let (err_line, err_col) = last_dot.unwrap_or((start_line, start_col));
834        return Err(ParseError {
835            message: "empty segment in path".into(),
836            line: err_line,
837            col: err_col,
838        });
839    }
840
841    Ok(SubstPayload {
842        segments,
843        optional,
844        list_suffix,
845    })
846}
847
848fn is_unquoted_start(ch: char) -> bool {
849    if is_hocon_whitespace(ch) {
850        return false;
851    }
852    !matches!(
853        ch,
854        '{' | '}'
855            | '['
856            | ']'
857            | ','
858            | ':'
859            | '='
860            | '+'
861            | '#'
862            | '"'
863            | '$'
864            | '?'
865            | '!'
866            | '@'
867            | '*'
868            | '&'
869            | '^'
870            | '\\'
871    )
872}
873
874fn is_unquoted_continue(ch: char, next_fn: impl Fn() -> char) -> bool {
875    if is_hocon_whitespace(ch) {
876        return false;
877    }
878    if matches!(
879        ch,
880        '{' | '}'
881            | '['
882            | ']'
883            | ','
884            | ':'
885            | '='
886            | '#'
887            | '"'
888            | '$'
889            | '?'
890            | '!'
891            | '@'
892            | '*'
893            | '&'
894            | '^'
895            | '\\'
896    ) {
897        return false;
898    }
899    if ch == '+' && next_fn() == '=' {
900        return false;
901    }
902    if ch == '/' && next_fn() == '/' {
903        return false;
904    }
905    true
906}
907
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` and returns just the token kinds, in order.
    fn kinds(input: &str) -> Vec<TokenKind> {
        tokenize(input)
            .unwrap()
            .into_iter()
            .map(|tok| tok.kind)
            .collect()
    }

    /// Lexes `input` and returns the first token produced.
    fn first(input: &str) -> Token {
        let mut stream = tokenize(input).unwrap().into_iter();
        stream.next().unwrap()
    }

    /// Lexes `input` and returns the values of all `Unquoted` tokens, in order.
    fn unquoted_values(input: &str) -> Vec<String> {
        tokenize(input)
            .unwrap()
            .into_iter()
            .filter(|tok| tok.kind == TokenKind::Unquoted)
            .map(|tok| tok.value)
            .collect()
    }

    #[test]
    fn tokenizes_empty_string() {
        // Empty input yields exactly one token: Eof.
        assert_eq!(kinds(""), vec![TokenKind::Eof]);
    }

    #[test]
    fn tokenizes_braces_and_brackets() {
        let expected = vec![
            TokenKind::LBrace,
            TokenKind::RBrace,
            TokenKind::LBracket,
            TokenKind::RBracket,
            TokenKind::Eof,
        ];
        assert_eq!(kinds("{}[]"), expected);
    }

    #[test]
    fn tokenizes_equals_and_plus_equals() {
        let seen = kinds("=+=");
        assert_eq!(seen[0], TokenKind::Equals);
        assert_eq!(seen[1], TokenKind::PlusEquals);
    }

    #[test]
    fn tokenizes_colon_and_comma() {
        let expected = vec![TokenKind::Colon, TokenKind::Comma, TokenKind::Eof];
        assert_eq!(kinds(":,"), expected);
    }

    #[test]
    fn skips_slash_comments_keeps_newline() {
        let toks = tokenize("// comment\nfoo").unwrap();
        let (newline, word) = (&toks[0], &toks[1]);
        assert_eq!(newline.kind, TokenKind::Newline);
        assert_eq!(word.kind, TokenKind::Unquoted);
        assert_eq!(word.value, "foo");
    }

    #[test]
    fn skips_hash_comments() {
        let toks = tokenize("# comment\nfoo").unwrap();
        assert_eq!(toks[0].kind, TokenKind::Newline);
        assert_eq!(toks[1].value, "foo");
    }

    #[test]
    fn tokenizes_quoted_strings() {
        let tok = first("\"hello world\"");
        assert_eq!(tok.kind, TokenKind::QuotedString);
        assert_eq!(tok.value, "hello world");
        assert!(tok.is_quoted);
    }

    #[test]
    fn handles_escape_sequences() {
        assert_eq!(first("\"a\\nb\\tc\"").value, "a\nb\tc");
    }

    #[test]
    fn handles_unicode_escapes() {
        assert_eq!(first("\"\\u0041\"").value, "A");
    }

    #[test]
    fn tokenizes_triple_quoted_strings() {
        let tok = first("\"\"\"hello\nworld\"\"\"");
        assert_eq!(tok.kind, TokenKind::TripleQuotedString);
        assert_eq!(tok.value, "hello\nworld");
        assert!(tok.is_quoted);
    }

    #[test]
    fn strips_leading_newline_from_triple_quoted() {
        assert_eq!(first("\"\"\"\nhello\"\"\"").value, "hello");
    }

    #[test]
    fn tokenizes_unquoted_strings() {
        let tok = first("localhost");
        assert_eq!(tok.kind, TokenKind::Unquoted);
        assert_eq!(tok.value, "localhost");
        assert!(!tok.is_quoted);
    }

    #[test]
    fn tokenizes_numbers_as_unquoted() {
        let tok = first("8080");
        assert_eq!(tok.kind, TokenKind::Unquoted);
        assert_eq!(tok.value, "8080");
    }

    #[test]
    fn tokenizes_substitutions() {
        let tok = first("${server.host}");
        assert_eq!(tok.kind, TokenKind::Substitution);
        assert_eq!(tok.value, "server.host");
    }

    #[test]
    fn tokenizes_optional_substitutions() {
        let tok = first("${?foo}");
        assert_eq!(tok.kind, TokenKind::Substitution);
        assert_eq!(tok.value, "foo");
        assert!(tok.subst.as_ref().unwrap().optional);
    }

    #[test]
    fn tokenizes_newlines() {
        assert_eq!(kinds("a\nb")[1], TokenKind::Newline);
    }

    #[test]
    fn deduplicates_consecutive_newlines() {
        // A run of LFs collapses to a single Newline token.
        let newline_count = kinds("a\n\n\nb")
            .iter()
            .filter(|kind| **kind == TokenKind::Newline)
            .count();
        assert_eq!(newline_count, 1);
    }

    #[test]
    fn tracks_line_and_col() {
        let toks = tokenize("a\nb").unwrap();
        // `a` sits at 1:1; `b` (after the Newline token) sits at 2:1.
        assert_eq!(toks[0].line, 1);
        assert_eq!(toks[0].col, 1);
        assert_eq!(toks[2].line, 2);
        assert_eq!(toks[2].col, 1);
    }

    #[test]
    fn sets_preceding_space() {
        let toks = tokenize("a b").unwrap();
        assert!(toks[1].preceding_space);
        assert!(!toks[0].preceding_space);
    }

    #[test]
    fn strips_utf8_bom() {
        assert_eq!(first("\u{FEFF}foo").value, "foo");
    }

    #[test]
    fn stops_unquoted_at_dollar_for_concat() {
        let toks = tokenize("foo${bar}").unwrap();
        let (head, subst) = (&toks[0], &toks[1]);
        assert_eq!(head.kind, TokenKind::Unquoted);
        assert_eq!(head.value, "foo");
        assert_eq!(subst.kind, TokenKind::Substitution);
        assert_eq!(subst.value, "bar");
        assert!(!subst.preceding_space);
    }

    #[test]
    fn throws_on_unterminated_string() {
        assert!(tokenize("\"unterminated").is_err());
    }

    #[test]
    fn throws_on_unterminated_substitution() {
        assert!(tokenize("${foo").is_err());
    }

    #[test]
    fn throws_on_unterminated_triple_quoted_string() {
        assert!(tokenize(r#""""unterminated"#).is_err());
    }

    // -------------------------------------------------------------------------
    // Spec compliance Phase 1 (issue #60): lexer-level rules.
    //
    // Every test below carries its xx.hocon spec checklist ID (S<n>.<m>).
    //
    // Convention for known spec violations:
    //   - The spec-correct test gets #[ignore = "spec violation, see #NN"], so
    //     CI stays green while the implementation is wrong; dropping the
    //     attribute once a fix lands turns the test into a required pass.
    //   - Where the right fix is ambiguous (e.g. S6.x, where a fix could
    //     plausibly reject or accept), a companion `_pin` test (no #[ignore])
    //     asserts the *current* behavior as a regression net.
    // -------------------------------------------------------------------------

    // --- S2.3: comment markers inside quoted strings are literal -------------
    // Spec L126: "//" and "#" inside a double-quoted string are string content,
    // not comment starters.
    #[test]
    fn s2_3_comment_markers_inside_quoted_string_are_literal() {
        let cases = [
            // The "//" in the URL must not start a comment.
            (r#""http://example.com""#, "http://example.com"),
            // The leading "#" must not start a comment.
            ("\"# not a comment\"", "# not a comment"),
        ];
        for (source, expected) in cases {
            let toks = tokenize(source).unwrap();
            assert_eq!(toks[0].kind, TokenKind::QuotedString);
            assert_eq!(toks[0].value, expected);
        }
    }

    // --- S6.1: Unicode Zs / Zl / Zp category chars are whitespace -----------
    // Spec L170: any Unicode whitespace-category character (Zs, Zl, Zp) is a
    // token separator rather than unquoted content. All of them are covered by
    // is_hocon_whitespace.

    // Em space (U+2003, Zs) separates two unquoted tokens.
    #[test]
    fn s6_1_em_space_separates_tokens_spec() {
        let unquoted = unquoted_values("a\u{2003}b");
        assert_eq!(unquoted.len(), 2, "em space should separate two tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // Line separator (U+2028, Zl) separates two unquoted tokens.
    #[test]
    fn s6_1_line_separator_separates_tokens_spec() {
        let unquoted = unquoted_values("a\u{2028}b");
        assert_eq!(unquoted.len(), 2, "U+2028 (Zl) should separate two tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // --- S6.2: non-breaking spaces are whitespace ----------------------------
    // Spec L171: U+00A0 (NBSP), U+2007 (figure space) and U+202F (narrow NBSP)
    // are whitespace. All three are in is_hocon_whitespace.

    // NBSP (U+00A0) separates tokens.
    #[test]
    fn s6_2_nbsp_separates_tokens_spec() {
        let unquoted = unquoted_values("a\u{00A0}b");
        assert_eq!(unquoted.len(), 2, "NBSP should separate two tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // Figure space (U+2007) separates tokens.
    #[test]
    fn s6_2_figure_space_separates_tokens_spec() {
        let unquoted = unquoted_values("a\u{2007}b");
        assert_eq!(unquoted.len(), 2, "figure space should separate two tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // Narrow NBSP (U+202F) separates tokens.
    #[test]
    fn s6_2_narrow_nbsp_separates_tokens_spec() {
        let unquoted = unquoted_values("a\u{202F}b");
        assert_eq!(unquoted.len(), 2, "narrow NBSP should separate two tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // --- S6.4: ASCII control whitespace --------------------------------------
    // Spec L174 names eight whitespace chars: tab (0x09), vtab (0x0B),
    // FF (0x0C), CR (0x0D), FS (0x1C), GS (0x1D), RS (0x1E), US (0x1F).
    // All eight are covered by is_hocon_whitespace.

    #[test]
    fn s6_4_tab_is_whitespace() {
        // Tab (0x09) is in the HOCON whitespace set.
        let unquoted = unquoted_values("a\tb");
        assert_eq!(unquoted.len(), 2);
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    #[test]
    fn s6_4_cr_is_whitespace() {
        // CR (0x0D) is in the HOCON whitespace set. A lone CR (no LF) is
        // inter-token whitespace and does not emit a Newline.
        let unquoted = unquoted_values("a\rb");
        assert_eq!(unquoted.len(), 2);
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // Vertical tab (0x0B) is whitespace.
    #[test]
    fn s6_4_vtab_is_whitespace_spec() {
        let unquoted = unquoted_values("a\x0Bb");
        assert_eq!(unquoted.len(), 2, "vtab should separate tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // Form feed (0x0C) is whitespace.
    #[test]
    fn s6_4_ff_is_whitespace_spec() {
        let unquoted = unquoted_values("a\x0Cb");
        assert_eq!(unquoted.len(), 2, "FF should separate tokens");
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // FS, GS, RS, US (0x1C–0x1F) are whitespace. They are exercised together
    // because they are four members of the same S6.4 list.
    #[test]
    fn s6_4_fs_gs_rs_us_are_whitespace_spec() {
        let separators = [
            ("FS (0x1C)", '\x1C'),
            ("GS (0x1D)", '\x1D'),
            ("RS (0x1E)", '\x1E'),
            ("US (0x1F)", '\x1F'),
        ];
        for (label, sep) in separators {
            let unquoted = unquoted_values(&format!("a{}b", sep));
            assert_eq!(unquoted.len(), 2, "{label} should separate tokens");
            assert_eq!(unquoted[0], "a", "{label}");
            assert_eq!(unquoted[1], "b", "{label}");
        }
    }

    // --- LF regression guard: LF must still emit Newline token ---------------
    // With the predicate centralized, is_hocon_whitespace is true for LF, so
    // the newline branch has to run BEFORE the whitespace skip for LF to keep
    // producing TokenKind::Newline (per spec §D, design invariant).
    #[test]
    fn s6_lf_still_emits_newline_token() {
        assert!(
            kinds("a\nb").contains(&TokenKind::Newline),
            "LF must still emit a Newline token after whitespace predicate centralization"
        );
    }

    // --- S6.3 (broadened): BOM mid-stream is whitespace ----------------------
    // Spec L173: BOM (U+FEFF) is whitespace, not a start-of-input marker.
    // The lexer still strips a BOM at char index 0 (harmless redundancy); a
    // BOM anywhere else is consumed as inter-token whitespace through
    // is_hocon_whitespace.
    #[test]
    fn s6_3_bom_midstream_is_whitespace() {
        let unquoted = unquoted_values("a\u{FEFF}b");
        assert_eq!(
            unquoted.len(),
            2,
            "BOM mid-stream should separate two tokens"
        );
        assert_eq!(unquoted[0], "a");
        assert_eq!(unquoted[1], "b");
    }

    // --- S8.6 / E8: unquoted string begin rules (post-E8 amendment) ---------
    //
    // The E8 amendment (xx.hocon#31 / commit dd102e8) reads HOCON.md L270-276
    // "begin" as value-position begin (first component of a concatenation),
    // not token-position begin at any lexer offset. At value-start:
    //   - the lexer reads the whole run as one unquoted token (there is no
    //     separate number token kind); numeric coercion happens later, in
    //     parse_scalar_value. Runs that don't parse as numbers (e.g. `123abc`)
    //     stay strings.
    //   - `-` not followed by a digit starts an unquoted run (the strict
    //     lexer-level reject was removed per E8).
    // Path-element rules (substitution body, dotted key segments) stay
    // strict — covered in tests/s8_unquoted_starts.rs.

    #[test]
    fn e8_value_start_digit_leading_with_letters_is_string() {
        // `123abc` is not a valid number, so parse_scalar_value falls back to
        // ScalarType::String — the same observable result as Lightbend (whose
        // parseLong/parseFloat both fail and produce an unquoted concat).
        // The resolved value is asserted (not merely is_ok) so accidental
        // coercion or truncation would show up here.
        let cfg = crate::parse("x = 123abc").expect("parse failed");
        let resolved = cfg.get_string("x").expect("x not found");
        assert_eq!(
            resolved, "123abc",
            "E8: `123abc` must lex+resolve as unquoted string \"123abc\""
        );
    }

    #[test]
    fn e8_value_start_hyphen_leading_non_number_is_string() {
        // Before E8 this was a lex error (strict S8.6 reading). After E8,
        // `-foo` is an unquoted string at value-position: an RFC 8259 JSON
        // number needs a digit after `-`, so bare `-foo` is outside L270's
        // disallow scope. The Lightbend reference yields `{"x":"-foo"}`.
        // The resolved value is asserted (not merely is_ok) so accidental
        // coercion or truncation would show up here.
        let cfg = crate::parse("x = -foo").expect("parse failed");
        let resolved = cfg.get_string("x").expect("x not found");
        assert_eq!(
            resolved, "-foo",
            "E8: `-foo` must lex+resolve as unquoted string \"-foo\""
        );
    }

    // --- S8.7: no escape sequences in unquoted strings -----------------------
    // Spec L253: unquoted strings interpret no escape sequences. A backslash
    // is forbidden inside an unquoted run (rs.hocon excludes '\' from both
    // is_unquoted_start and is_unquoted_continue, so it ends the run), and a
    // bare backslash is a lexer error.
    #[test]
    fn s8_7_backslash_is_rejected_in_unquoted_context() {
        // For `a\n` outside quotes the lexer reads `a` as unquoted, then meets
        // `\`, which is neither a valid unquoted character nor a recognised
        // token introducer — so lexing should fail.
        let outcome = tokenize(r"a\n");
        assert!(
            outcome.is_err(),
            "bare backslash outside quotes must be rejected"
        );
    }

    // --- S8.8: unquoted strings allow control chars except forbidden set -----
    // Spec L280: control characters OTHER than the forbidden set (L245:
    // $ " { } [ ] : = , + # ` ^ ? ! @ * & \ and whitespace) are permitted
    // inside unquoted strings.
    #[test]
    fn s8_8_soh_allowed_in_unquoted_string() {
        // SOH (0x01) is a control character outside the forbidden set.
        let unquoted = unquoted_values("foo\x01bar");
        assert_eq!(unquoted.len(), 1);
        assert_eq!(unquoted[0], "foo\x01bar");
    }

    #[test]
    fn s8_8_bel_allowed_in_unquoted_string() {
        // BEL (0x07) is a control character outside the forbidden set.
        let unquoted = unquoted_values("foo\x07bar");
        assert_eq!(unquoted.len(), 1);
        assert_eq!(unquoted[0], "foo\x07bar");
    }
}