hocon-parser 1.6.0

Full Lightbend HOCON specification-compliant parser for Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
use crate::error::ParseError;

/// Lexical category of a [`Token`] emitted by [`tokenize`].
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `=`
    Equals,
    /// `+=`
    PlusEquals,
    /// ASCII LF. The lexer does not emit a `Newline` when the previous token
    /// is already a `Newline`, so blank lines collapse into one token.
    Newline,
    /// A `"..."` string; the token value holds the decoded (unescaped) body.
    QuotedString,
    /// A `"""..."""` string; the token value holds the raw body.
    TripleQuotedString,
    /// A run of characters accepted by the unquoted-string rules.
    Unquoted,
    /// A `${...}` or `${?...}` substitution; details live in `Token::subst`.
    Substitution,
    /// End of input; always the final token of a successful `tokenize` call.
    Eof,
}

/// One dot-separated path element inside a `${...}` substitution body.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// Decoded segment text: quoted fragments are unescaped and adjacent
    /// quoted/unquoted fragments (plus any whitespace between them) are joined.
    pub text: String,
    /// Line recorded for the segment; the lexer uses the line on which the
    /// enclosing substitution token starts.
    pub line: usize,
    /// Column of the first fragment of this segment.
    pub col: usize,
}

/// Payload carried by a `${...}` or `${?...}` substitution token.
///
/// `#[non_exhaustive]` ensures that adding new fields here (e.g. future spec
/// extensions) does not break downstream crates that pattern-match or
/// construct this struct.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct SubstPayload {
    /// Path segments in source order; never empty for a successfully lexed
    /// substitution.
    pub segments: Vec<Segment>,
    /// True when the body starts with the `?` sigil (`${?...}`).
    pub optional: bool,
    /// True when the substitution body carries a `[]` suffix, signalling
    /// env-var-list expansion (`${X[]}` / `${?X[]}`).
    pub list_suffix: bool,
}

/// A single token produced by the lexer.
///
/// `Token` is publicly re-exported as `hocon::Token` for the narrow surface
/// that integration tests and diagnostic tooling need (per the advisory in
/// `lib.rs`). It is marked `#[non_exhaustive]`: downstream code MUST NOT
/// construct `Token` via struct-literal syntax and should treat it as
/// inspect-only. This frees the lexer to add new metadata fields (e.g.
/// `preceding_whitespace` in v1.5.3) without further source breaks.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Token {
    /// Lexical category of this token.
    pub kind: TokenKind,
    /// Token text. Punctuation and newline tokens carry their literal text,
    /// quoted strings carry the decoded body, substitutions carry a
    /// dot-joined path rebuilt from the parsed segments, and `Eof` is empty.
    pub value: String,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column (counted in `char`s) at which the token starts.
    pub col: usize,
    /// True for `QuotedString` and `TripleQuotedString` tokens, false otherwise.
    // NOTE(review): `allow(dead_code)` suggests nothing in the crate reads this
    // field at present — confirm before relying on or removing it.
    #[allow(dead_code)]
    pub is_quoted: bool,
    /// True if preceded by whitespace OR a comment (concat detection, S10.5 / S10.8).
    pub preceding_space: bool,
    /// Literal preceding-whitespace chars consumed since the previous token.
    /// Used by `parse_key` to preserve path-expression whitespace per E13 — for
    /// `a b. c = 1` the ' ' before `c` becomes a leading-space prefix on the
    /// post-dot segment.
    ///
    /// Note: `preceding_space` may be true while `preceding_whitespace` is empty
    /// when the token is preceded only by a comment (no literal WS chars). The
    /// boolean is the right signal for concat detection; the string is the right
    /// signal for path-WS preservation. The comment-only shape fires for the
    /// `newline` token emitted after `// foo\n` / `# foo\n`; non-newline tokens
    /// participating in concat / path-WS contexts are always either preceded by
    /// literal WS chars OR follow a newline that resets the buffer.
    pub preceding_whitespace: String,
    /// Parsed substitution details; `Some` only for `Substitution` tokens.
    pub subst: Option<SubstPayload>,
}

/// Tests whether `ch` belongs to the HOCON whitespace set.
///
/// Lightbend HOCON.md §Whitespace (L165-184) defines the set as everything
/// Java's `Character.isWhitespace` accepts, plus the non-breaking spaces Java
/// leaves out (U+00A0, U+2007, U+202F), plus the byte-order mark (U+FEFF).
///
/// Members, grouped the same way as the pattern below:
///   ASCII controls:  U+0009..=U+000D (TAB, LF, VTAB, FF, CR),
///                    U+001C..=U+001F (FS, GS, RS, US)
///   Zs:              U+0020, U+00A0, U+1680, U+2000..=U+200A, U+202F,
///                    U+205F, U+3000
///   Zl / Zp:         U+2028, U+2029
///   BOM:             U+FEFF
///
/// NOTE: LF (U+000A) is a member because Java counts it as whitespace. Code
/// that must tell a newline apart from inter-token whitespace has to test
/// `is_hocon_newline` before calling this function.
pub(crate) fn is_hocon_whitespace(ch: char) -> bool {
    matches!(
        ch,
        // ASCII controls.
        '\u{0009}'..='\u{000D}'
            | '\u{001C}'..='\u{001F}'
            // Space separators, including the non-breaking variants.
            | '\u{0020}'
            | '\u{00A0}'
            | '\u{1680}'
            | '\u{2000}'..='\u{200A}'
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
            // Line and paragraph separators.
            | '\u{2028}'..='\u{2029}'
            // Byte-order mark.
            | '\u{FEFF}'
    )
}

/// Tests whether `ch` is the one character HOCON treats as a newline.
///
/// HOCON.md L182-184 states that "newline refers only and specifically to
/// ASCII newline 0x000A". The Unicode line and paragraph separators (U+2028,
/// U+2029) count as whitespace, not as newlines.
fn is_hocon_newline(ch: char) -> bool {
    matches!(ch, '\u{000A}')
}

/// Split HOCON source text into a flat token stream.
///
/// The returned vector always ends with a single `Eof` token. A leading
/// U+FEFF (BOM) is skipped. Whitespace and comments (`//...` and `#...` up to
/// the next LF) produce no tokens of their own; they are reflected in the
/// `preceding_space` / `preceding_whitespace` fields of the token that
/// follows. A `Newline` token is emitted for LF unless the previous token is
/// already a `Newline`.
///
/// Lines and columns are 1-based; columns count `char`s.
///
/// # Errors
///
/// Returns a `ParseError` for an unterminated triple-quoted string, for any
/// error reported by `read_quoted_body` or `parse_subst_body`, and for a
/// character that cannot start any token.
pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut pos = 0usize;
    let mut line = 1usize;
    let mut col = 1usize;
    let mut had_space = false;
    // E13 — accumulates literal whitespace chars consumed between tokens.
    // Drained (via std::mem::take) on every token push. Comment text is NOT
    // accumulated; only the actual WS chars.
    let mut whitespace_buffer = String::new();

    // Strip UTF-8 BOM
    if !chars.is_empty() && chars[0] == '\u{FEFF}' {
        pos = 1;
    }

    // Lookahead helper; yields NUL when the requested index is past the end.
    let peek =
        |pos: usize, offset: usize| -> char { chars.get(pos + offset).copied().unwrap_or('\0') };

    while pos < chars.len() {
        // Start position of the token being lexed, used for error reporting
        // and for the token's own line/col.
        let sl = line;
        let sc = col;
        let ch = chars[pos];

        // Newline (must be checked before general whitespace because
        // is_hocon_whitespace also returns true for LF — see spec §D).
        if is_hocon_newline(ch) {
            pos += 1;
            line += 1;
            col = 1;
            // NOTE(review): when the newline is collapsed (previous token is
            // already a Newline) `had_space` and `whitespace_buffer` are not
            // reset, so whitespace from blank lines carries over to the next
            // token — confirm this is intended.
            if tokens
                .last()
                .is_none_or(|t: &Token| t.kind != TokenKind::Newline)
            {
                tokens.push(Token {
                    kind: TokenKind::Newline,
                    value: "\n".into(),
                    line: sl,
                    col: sc,
                    is_quoted: false,
                    preceding_space: had_space,
                    preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                    subst: None,
                });
                had_space = false;
            }
            continue;
        }

        // Whitespace (not newline) — full HOCON_WS set per spec L165-184.
        if is_hocon_whitespace(ch) {
            whitespace_buffer.push(ch);
            pos += 1;
            col += 1;
            had_space = true;
            continue;
        }

        // Comments: skipped up to (not including) the terminating LF, and
        // treated as a separator for concat detection.
        if ch == '/' && peek(pos, 1) == '/' {
            while pos < chars.len() && chars[pos] != '\n' {
                pos += 1;
                col += 1;
            }
            had_space = true;
            continue;
        }
        if ch == '#' {
            while pos < chars.len() && chars[pos] != '\n' {
                pos += 1;
                col += 1;
            }
            had_space = true;
            continue;
        }

        // Single-char punctuation
        let single_kind = match ch {
            '{' => Some(TokenKind::LBrace),
            '}' => Some(TokenKind::RBrace),
            '[' => Some(TokenKind::LBracket),
            ']' => Some(TokenKind::RBracket),
            ',' => Some(TokenKind::Comma),
            ':' => Some(TokenKind::Colon),
            _ => None,
        };
        if let Some(kind) = single_kind {
            pos += 1;
            col += 1;
            tokens.push(Token {
                kind,
                value: ch.to_string(),
                line: sl,
                col: sc,
                is_quoted: false,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }

        // = and +=
        if ch == '=' {
            pos += 1;
            col += 1;
            tokens.push(Token {
                kind: TokenKind::Equals,
                value: "=".into(),
                line: sl,
                col: sc,
                is_quoted: false,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }
        if ch == '+' && peek(pos, 1) == '=' {
            pos += 2;
            col += 2;
            tokens.push(Token {
                kind: TokenKind::PlusEquals,
                value: "+=".into(),
                line: sl,
                col: sc,
                is_quoted: false,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }

        // Substitution ${...} or ${?...}
        if ch == '$' && peek(pos, 1) == '{' {
            pos += 2;
            col += 2;
            let payload = parse_subst_body(&chars, &mut pos, &mut col, sl, sc)?;
            // Reconstruct a canonical value string from segments.
            // Segments that need quoting (contain dot, space, empty, etc.) are wrapped in "...".
            let value = payload
                .segments
                .iter()
                .map(|s| {
                    let t = &s.text;
                    if t.is_empty()
                        || t.contains('.')
                        || t.contains(' ')
                        || t.contains('\t')
                        || t.contains('"')
                        || t.contains('\\')
                        || t != t.trim()
                    {
                        let escaped = t.replace('\\', "\\\\").replace('"', "\\\"");
                        format!("\"{}\"", escaped)
                    } else {
                        t.clone()
                    }
                })
                .collect::<Vec<_>>()
                .join(".");
            tokens.push(Token {
                kind: TokenKind::Substitution,
                value,
                line: sl,
                col: sc,
                is_quoted: false,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: Some(payload),
            });
            had_space = false;
            continue;
        }

        // Triple-quoted string
        if ch == '"' && peek(pos, 1) == '"' && peek(pos, 2) == '"' {
            pos += 3;
            col += 3;
            let mut value = String::new();
            let mut found_closing = false;
            loop {
                if pos >= chars.len() {
                    break;
                }
                if chars[pos] == '"' {
                    // Count the whole run of quotes: the last three close the
                    // string, any extras before them belong to the body.
                    let mut quote_count = 0;
                    while pos < chars.len() && chars[pos] == '"' {
                        quote_count += 1;
                        pos += 1;
                        col += 1;
                    }
                    if quote_count >= 3 {
                        for _ in 0..(quote_count - 3) {
                            value.push('"');
                        }
                        found_closing = true;
                        break;
                    }
                    for _ in 0..quote_count {
                        value.push('"');
                    }
                    continue;
                }
                if chars[pos] == '\n' {
                    line += 1;
                    col = 1;
                } else {
                    col += 1;
                }
                value.push(chars[pos]);
                pos += 1;
            }
            if !found_closing {
                return Err(ParseError {
                    message: "unterminated triple-quoted string".into(),
                    line: sl,
                    col: sc,
                });
            }
            // A newline directly after the opening quotes is dropped.
            if value.starts_with('\n') {
                value = value[1..].to_string();
            }
            tokens.push(Token {
                kind: TokenKind::TripleQuotedString,
                value,
                line: sl,
                col: sc,
                is_quoted: true,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }

        // Quoted string
        if ch == '"' {
            pos += 1;
            col += 1;
            let value = read_quoted_body(&chars, &mut pos, &mut col, sl, sc)?;
            tokens.push(Token {
                kind: TokenKind::QuotedString,
                value,
                line: sl,
                col: sc,
                is_quoted: true,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }

        // Unquoted string
        if is_unquoted_start(ch) {
            // S8.6 / E8 (xx.hocon#31, xx.hocon#32 / commit dd102e8): the
            // value-position read of HOCON.md L270-276 admits `-` even when
            // not followed by a digit (bare `-` and `-foo` are unquoted
            // strings, matching Lightbend's reference) and admits digit-
            // leading runs (greedy: parse as number first, fall back to
            // unquoted string when the run isn't a valid number — rs.hocon
            // has no separate Number token kind, so this is realized at the
            // parser/coerce layer in parse_scalar_value). The strict reject
            // at this site was removed by the E8 amendment; concat-
            // continuation cases like `${a}-bar` rely on the absence of
            // that reject to extend the unquoted run after a value-token.
            // Path-element strict checks live elsewhere — see
            // parse_subst_body (this file) and parse_key (parser.rs).
            let mut value = String::new();
            while pos < chars.len() && is_unquoted_continue(chars[pos], || peek(pos, 1)) {
                value.push(chars[pos]);
                pos += 1;
                col += 1;
            }
            // Deliberately no `trim_end()` here. The loop above stops at the
            // first HOCON whitespace char, so `value` never ends in one; the
            // only thing Rust's Unicode-aware trim could remove is U+0085
            // (NEL), which is NOT HOCON whitespace and must stay in the token.
            tokens.push(Token {
                kind: TokenKind::Unquoted,
                value,
                line: sl,
                col: sc,
                is_quoted: false,
                preceding_space: had_space,
                preceding_whitespace: std::mem::take(&mut whitespace_buffer),
                subst: None,
            });
            had_space = false;
            continue;
        }

        return Err(ParseError {
            message: format!("unexpected character: {:?}", ch),
            line: sl,
            col: sc,
        });
    }

    tokens.push(Token {
        kind: TokenKind::Eof,
        value: String::new(),
        line,
        col,
        is_quoted: false,
        preceding_space: false,
        preceding_whitespace: String::new(),
        subst: None,
    });
    Ok(tokens)
}

/// Read the body of a quoted string (opening `"` already consumed).
///
/// On success the closing `"` has been consumed, `*pos` / `*col` point just
/// past it, and the decoded string is returned.
///
/// Supported escapes: `\n`, `\t`, `\r`, `\"`, `\\`, `\/`, `\b`, `\f` and
/// `\uXXXX`. As in JSON, a character outside the Basic Multilingual Plane is
/// written as a UTF-16 surrogate pair (`\uD83D\uDE00`); the pair is combined
/// into a single `char`. A lone surrogate cannot be represented in a Rust
/// `String` and is rejected.
///
/// `open_line`/`open_col` are the position of the opening `"` for error
/// reporting.
///
/// # Errors
///
/// * "unterminated string" — a literal LF or end of input before the closing
///   quote (reported at the opening quote).
/// * "invalid unicode escape" — `\u` not followed by four hex digits, or a
///   surrogate that is not part of a valid high/low pair (reported at the
///   column of the backslash).
/// * "invalid escape sequence" — any other character after a backslash
///   (reported at the column of the backslash).
fn read_quoted_body(
    chars: &[char],
    pos: &mut usize,
    col: &mut usize,
    open_line: usize,
    open_col: usize,
) -> Result<String, ParseError> {
    /// Decode exactly four ASCII hex digits starting at `chars[at]`.
    fn hex4(chars: &[char], at: usize) -> Option<u32> {
        chars
            .get(at..at.checked_add(4)?)?
            .iter()
            .try_fold(0u32, |acc, c| Some((acc << 4) | c.to_digit(16)?))
    }

    let unterminated = move || ParseError {
        message: "unterminated string".into(),
        line: open_line,
        col: open_col,
    };

    let mut value = String::new();
    loop {
        // End of input and a raw LF both mean the closing quote is missing.
        let ch = match chars.get(*pos) {
            Some(&c) if c != '\n' => c,
            _ => return Err(unterminated()),
        };
        let here = *col;
        *pos += 1;
        *col += 1;

        match ch {
            '"' => return Ok(value),
            '\\' => {
                // Errors inside an escape point at the backslash.
                let esc_col = here;
                let esc = *chars.get(*pos).ok_or_else(unterminated)?;
                *pos += 1;
                *col += 1;
                match esc {
                    'n' => value.push('\n'),
                    't' => value.push('\t'),
                    'r' => value.push('\r'),
                    '"' => value.push('"'),
                    '\\' => value.push('\\'),
                    '/' => value.push('/'),
                    'b' => value.push('\u{0008}'),
                    'f' => value.push('\u{000C}'),
                    'u' => {
                        let bad_unicode = move || ParseError {
                            message: "invalid unicode escape".into(),
                            line: open_line,
                            col: esc_col,
                        };
                        let at = *pos;
                        let first = hex4(chars, at).ok_or_else(bad_unicode)?;
                        let mut consumed = 4;
                        let code = if (0xD800..=0xDBFF).contains(&first) {
                            // High surrogate: it must be followed immediately
                            // by `\uDC00`..`\uDFFF`; combine the two UTF-16
                            // code units into one scalar value.
                            let has_low_prefix = chars.get(at + 4) == Some(&'\\')
                                && chars.get(at + 5) == Some(&'u');
                            let low = has_low_prefix
                                .then(|| hex4(chars, at + 6))
                                .flatten()
                                .filter(|lo| (0xDC00..=0xDFFF).contains(lo))
                                .ok_or_else(bad_unicode)?;
                            consumed += 6;
                            0x10000 + ((first - 0xD800) << 10) + (low - 0xDC00)
                        } else {
                            first
                        };
                        // Still fails for a lone low surrogate.
                        let decoded = char::from_u32(code).ok_or_else(bad_unicode)?;
                        value.push(decoded);
                        *pos += consumed;
                        *col += consumed;
                    }
                    _ => {
                        return Err(ParseError {
                            message: "invalid escape sequence".into(),
                            line: open_line,
                            col: esc_col,
                        });
                    }
                }
            }
            other => value.push(other),
        }
    }
}

/// Tests whether `ch` may appear in an unquoted fragment of a `${...}` body.
///
/// Rejected characters:
///   * every HOCON whitespace character (see `is_hocon_whitespace`);
///   * `"`, `\`, `{`, `}`, `[`, `]`, `:`, `=`, `,`, `+`, `#`, `` ` ``, `^`,
///     `?`, `!`, `@`, `*`, `&`, `$`, and `.`.
///
/// Everything else is accepted.
fn is_unquoted_subst_char(ch: char) -> bool {
    const FORBIDDEN: [char; 20] = [
        '"', '\\', '{', '}', '[', ']', ':', '=', ',', '+', '#', '`', '^', '?', '!', '@', '*',
        '&', '$', '.',
    ];
    !(is_hocon_whitespace(ch) || FORBIDDEN.contains(&ch))
}

/// Consume the exact two-character sequence `[]` starting at `chars[*pos]`.
///
/// `parse_subst_body` calls this from its `[` arm, so `chars[*pos]` is `[` on
/// entry. The match is strict: the `]` must follow immediately, which makes
/// `${X[ ]}` a lex error. On success `*pos` and `*col` have both advanced by
/// two.
///
/// # Errors
///
/// Returns a `ParseError` on `start_line`, at the column just after the `[`,
/// when the next character is missing or is not `]`.
fn parse_literal_brackets(
    chars: &[char],
    pos: &mut usize,
    col: &mut usize,
    start_line: usize,
) -> Result<(), ParseError> {
    debug_assert!(*pos < chars.len() && chars[*pos] == '[');
    // Step over the opening bracket.
    *pos += 1;
    *col += 1;

    // Anything but an immediate `]` (including end of input) is an error.
    match chars.get(*pos).copied() {
        Some(']') => {
            *pos += 1;
            *col += 1;
            Ok(())
        }
        unexpected => {
            let got = match unexpected {
                Some(c) => c.escape_debug().to_string(),
                None => String::from("EOF"),
            };
            Err(ParseError {
                message: format!(
                    "expected ']' after '[' in substitution list suffix, got {}",
                    got
                ),
                line: start_line,
                col: *col,
            })
        }
    }
}

/// Parse the body of a `${...}` substitution (called after `${` has been consumed).
/// Returns the `SubstPayload` or a `ParseError`.
///
/// The body is an optional `?` sigil followed by a dot-separated path and an
/// optional `[]` list suffix, terminated by `}`. Each path segment is built
/// from one or more adjacent quoted and unquoted fragments; whitespace between
/// two fragments of the same segment is kept, while whitespace before the
/// first fragment, around a dot, before `[]`, or before `}` is dropped.
///
/// * `chars` — the whole input as a `char` slice.
/// * `pos` / `col` — cursor just past `${` on entry; on success they point
///   just past the closing `}`.
/// * `start_line` / `start_col` — position of the substitution token, used
///   for error reporting and as the line of every `Segment`.
///
/// # Errors
///
/// Fails on end of input or a literal LF before `}` ("unterminated
/// substitution"), an empty path, an empty segment (leading, doubled or
/// trailing dot, or `[]` with no segment before it), an unquoted segment that
/// starts with `-` not followed by a digit, a malformed `[]` suffix, a
/// character that is not allowed in the path, and on any error reported by
/// `read_quoted_body`.
fn parse_subst_body(
    chars: &[char],
    pos: &mut usize,
    col: &mut usize,
    start_line: usize,
    start_col: usize,
) -> Result<SubstPayload, ParseError> {
    // Assumes `${` already consumed. Position is at char after `{`.

    // START: check for optional sigil
    let optional = if *pos < chars.len() && chars[*pos] == '?' {
        *pos += 1;
        *col += 1;
        true
    } else {
        false
    };

    // COLLECT
    // current segment state
    let mut cur_text = String::new();
    let mut cur_started = false;
    let mut cur_line = 0usize;
    let mut cur_col = 0usize;

    // Whitespace seen since the last fragment; only re-attached when another
    // fragment of the same segment follows.
    let mut pending_ws = String::new();
    let mut segments: Vec<Segment> = Vec::new();
    // Track last-seen DOT position for trailing-dot error reporting.
    let mut last_dot: Option<(usize, usize)> = None;
    // Set to true when a `[]` suffix is encountered (S13c env-var-list).
    let mut list_suffix = false;

    loop {
        if *pos >= chars.len() {
            return Err(ParseError {
                message: "unterminated substitution".into(),
                line: start_line,
                col: start_col,
            });
        }
        let ch = chars[*pos];

        match ch {
            '}' => {
                // END
                *pos += 1;
                *col += 1;
                // Drop pending_ws (trailing whitespace)
                pending_ws.clear();
                break;
            }
            '"' => {
                // QUOTED token
                let q_line = start_line; // all on same conceptual line (no literal newlines allowed)
                let q_col = *col;
                // Whitespace between two fragments of one segment is part of
                // the segment text.
                if cur_started {
                    cur_text.push_str(&pending_ws);
                }
                pending_ws.clear();
                *pos += 1;
                *col += 1;
                let decoded = read_quoted_body(chars, pos, col, q_line, q_col)?;
                cur_text.push_str(&decoded);
                if !cur_started {
                    cur_line = q_line;
                    cur_col = q_col;
                    cur_started = true;
                }
            }
            ch if is_unquoted_subst_char(ch) => {
                // S8.6 (HOCON.md L270–276) also applies to unquoted path
                // segments inside ${...}: a segment beginning with '-' must be
                // followed by a digit. Gate on `!cur_started` so the check
                // fires only at **segment start** — a `-` that follows a
                // quoted fragment in the same segment (e.g. `${"a"-foo}`
                // resolving the key `"a-foo"` via quoted/unquoted concat) is
                // not policed, mirroring how the existing `${"a"x}` flow
                // builds `"ax"`. Digit-leading segments are not policed here
                // either (consistent with the value-position rule and
                // rs.hocon's unquoted-only token model — see
                // docs/spec-compliance.md §S8.6).
                if ch == '-' && !cur_started {
                    let next = chars.get(*pos + 1).copied().unwrap_or('\0');
                    if !next.is_ascii_digit() {
                        let after = if next == '\0' {
                            String::from("EOF")
                        } else {
                            format!("{:?}", next)
                        };
                        return Err(ParseError {
                            message: format!(
                                "unquoted path segment cannot begin with '-' unless followed by a digit (got '-' then {}, HOCON.md L270-276)",
                                after
                            ),
                            line: start_line,
                            col: *col,
                        });
                    }
                }
                // UNQUOTED token: read a run of unquoted chars
                let uq_col = *col;
                if cur_started {
                    cur_text.push_str(&pending_ws);
                }
                pending_ws.clear();
                if !cur_started {
                    cur_line = start_line;
                    cur_col = uq_col;
                    cur_started = true;
                }
                while *pos < chars.len() && is_unquoted_subst_char(chars[*pos]) {
                    cur_text.push(chars[*pos]);
                    *pos += 1;
                    *col += 1;
                }
            }
            '.' => {
                // DOT: flush current segment (or error if not started)
                let dot_col = *col;
                pending_ws.clear();
                if !cur_started {
                    return Err(ParseError {
                        message: "empty segment in path".into(),
                        line: start_line,
                        col: dot_col,
                    });
                }
                segments.push(Segment {
                    text: std::mem::take(&mut cur_text),
                    line: cur_line,
                    col: cur_col,
                });
                cur_started = false;
                cur_line = 0;
                cur_col = 0;
                last_dot = Some((start_line, dot_col));
                *pos += 1;
                *col += 1;
            }
            '[' => {
                // S13c: `[]` suffix — end of path expression, start of list-suffix.
                // Two convergent multi-impl checks (mirrors go.hocon + ts.hocon fixes):
                //
                //   (a) Empty-segment guard: error if no segment has been started AND
                //       either there are no segments yet (`${[]}` / `${ []}`) or a
                //       trailing dot was just consumed (`${X.[]}` / `${X . []}`).
                //       Both reduce to `!cur_started` — uniform error.
                //   (b) E7 narrow: pending_ws may contain only ASCII SPACE (0x20) or
                //       TAB (0x09). Wider HOCON whitespace (NBSP, CR, Zs, BOM, …) is
                //       accumulated by the broader inter-token WS arm below (S6 set)
                //       but is rejected here for the `[` boundary per extra-spec E7
                //       ("narrow allow-list intentionally avoids semantic surprise").
                if !cur_started {
                    return Err(ParseError {
                        message: "empty segment before '[]' suffix in substitution".into(),
                        line: start_line,
                        col: *col,
                    });
                }
                for w in pending_ws.chars() {
                    if w != ' ' && w != '\t' {
                        return Err(ParseError {
                            message: format!(
                                "only ASCII space or tab allowed between substitution path and '[]' suffix (got {:?}, HOCON extra-spec E7)",
                                w
                            ),
                            line: start_line,
                            col: *col,
                        });
                    }
                }
                // Flush in-progress unquoted segment (same as the `}` path).
                segments.push(Segment {
                    text: std::mem::take(&mut cur_text),
                    line: cur_line,
                    col: cur_col,
                });
                cur_started = false;
                // E7-conformant pending_ws is intentionally discarded.
                pending_ws.clear();
                // Consume the literal `[]`.
                parse_literal_brackets(chars, pos, col, start_line)?;
                list_suffix = true;
                // After `[]` the only legal next char is `}`.
                if *pos >= chars.len() || chars[*pos] != '}' {
                    return Err(ParseError {
                        message: "expected '}' after '[]' in substitution".into(),
                        line: start_line,
                        col: *col,
                    });
                }
                *pos += 1;
                *col += 1;
                break;
            }
            ch if is_hocon_whitespace(ch) && !is_hocon_newline(ch) => {
                // Inter-token whitespace (full HOCON_WS minus LF): buffer into
                // pending_ws; column advances but line is unchanged.
                pending_ws.push(ch);
                *pos += 1;
                *col += 1;
            }
            '\n' => {
                // LF inside ${...} is not allowed (unterminated substitution).
                return Err(ParseError {
                    message: "unterminated substitution".into(),
                    line: start_line,
                    col: start_col,
                });
            }
            other => {
                // Any remaining character is one that is forbidden in an
                // unquoted fragment and has no arm of its own above.
                return Err(ParseError {
                    message: format!(
                        "unexpected character in substitution path: {}",
                        other.escape_debug()
                    ),
                    line: start_line,
                    col: *col,
                });
            }
        }
    }

    // END validation (only reached via `}` break; `[]` break already pushes segment).
    if cur_started {
        segments.push(Segment {
            text: cur_text,
            line: cur_line,
            col: cur_col,
        });
    } else if segments.is_empty() {
        // ${}
        return Err(ParseError {
            message: "empty substitution path".into(),
            line: start_line,
            col: start_col,
        });
    } else if !list_suffix {
        // trailing dot: ${foo.} — report at the offending dot position.
        // Not an error when list_suffix=true; the `[]` arm already flushed.
        let (err_line, err_col) = last_dot.unwrap_or((start_line, start_col));
        return Err(ParseError {
            message: "empty segment in path".into(),
            line: err_line,
            col: err_col,
        });
    }

    Ok(SubstPayload {
        segments,
        optional,
        list_suffix,
    })
}

/// Returns `true` when `ch` may begin an unquoted string.
///
/// A character is rejected when it is HOCON whitespace (as decided by
/// `is_hocon_whitespace`) or one of the reserved characters
/// `{ } [ ] , : = + # " $ ? ! @ * & ^ \` plus the backtick.
///
/// The backtick is part of the HOCON forbidden-character set for unquoted
/// strings and is therefore rejected here as well.
fn is_unquoted_start(ch: char) -> bool {
    if is_hocon_whitespace(ch) {
        return false;
    }
    !matches!(
        ch,
        '{' | '}'
            | '['
            | ']'
            | ','
            | ':'
            | '='
            | '+'
            | '#'
            | '"'
            | '$'
            | '?'
            | '!'
            | '@'
            | '*'
            | '&'
            | '^'
            | '`'
            | '\\'
    )
}

/// Returns `true` when `ch` may extend an unquoted string that has already
/// started.
///
/// `next_fn` yields the character following `ch`; it is only invoked when a
/// one-character lookahead is needed (`+` and `/`).
///
/// A character is rejected when it is:
/// - HOCON whitespace (as decided by `is_hocon_whitespace`);
/// - one of the reserved characters `{ } [ ] , : = # " $ ? ! @ * & ^ \` or
///   the backtick (the backtick is part of the HOCON forbidden-character set
///   for unquoted strings);
/// - a `+` immediately followed by `=` (a lone `+` is accepted);
/// - a `/` immediately followed by `/` (a lone `/` is accepted).
fn is_unquoted_continue(ch: char, next_fn: impl Fn() -> char) -> bool {
    if is_hocon_whitespace(ch) {
        return false;
    }
    match ch {
        '{' | '}' | '[' | ']' | ',' | ':' | '=' | '#' | '"' | '$' | '?' | '!' | '@' | '*'
        | '&' | '^' | '`' | '\\' => false,
        // `+=` ends the run; any other `+` stays part of it.
        '+' => next_fn() != '=',
        // `//` ends the run; a single `/` stays part of it.
        '/' => next_fn() != '/',
        _ => true,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input` and returns just the kind of every token, in order.
    fn kinds(input: &str) -> Vec<TokenKind> {
        let tokens = tokenize(input).unwrap();
        tokens.iter().map(|tok| tok.kind.clone()).collect()
    }

    /// Lexes `input` and returns the first token produced.
    fn first(input: &str) -> Token {
        let mut stream = tokenize(input).unwrap().into_iter();
        stream.next().unwrap()
    }

    /// Lexes `input` and keeps only the `Unquoted` tokens, in order.
    fn unquoted(input: &str) -> Vec<Token> {
        tokenize(input)
            .unwrap()
            .into_iter()
            .filter(|tok| tok.kind == TokenKind::Unquoted)
            .collect()
    }

    #[test]
    fn tokenizes_empty_string() {
        // Empty input yields exactly one token: Eof.
        assert_eq!(kinds(""), vec![TokenKind::Eof]);
    }

    #[test]
    fn tokenizes_braces_and_brackets() {
        let expected = vec![
            TokenKind::LBrace,
            TokenKind::RBrace,
            TokenKind::LBracket,
            TokenKind::RBracket,
            TokenKind::Eof,
        ];
        assert_eq!(kinds("{}[]"), expected);
    }

    #[test]
    fn tokenizes_equals_and_plus_equals() {
        let seen = kinds("=+=");
        assert_eq!(seen[0], TokenKind::Equals);
        assert_eq!(seen[1], TokenKind::PlusEquals);
    }

    #[test]
    fn tokenizes_colon_and_comma() {
        let expected = vec![TokenKind::Colon, TokenKind::Comma, TokenKind::Eof];
        assert_eq!(kinds(":,"), expected);
    }

    #[test]
    fn skips_slash_comments_keeps_newline() {
        let tokens = tokenize("// comment\nfoo").unwrap();
        let (newline, word) = (&tokens[0], &tokens[1]);
        assert_eq!(newline.kind, TokenKind::Newline);
        assert_eq!(word.kind, TokenKind::Unquoted);
        assert_eq!(word.value, "foo");
    }

    #[test]
    fn skips_hash_comments() {
        let tokens = tokenize("# comment\nfoo").unwrap();
        let (newline, word) = (&tokens[0], &tokens[1]);
        assert_eq!(newline.kind, TokenKind::Newline);
        assert_eq!(word.value, "foo");
    }

    #[test]
    fn tokenizes_quoted_strings() {
        let tok = first("\"hello world\"");
        assert_eq!(tok.kind, TokenKind::QuotedString);
        assert_eq!(tok.value, "hello world");
        assert!(tok.is_quoted);
    }

    #[test]
    fn handles_escape_sequences() {
        assert_eq!(first("\"a\\nb\\tc\"").value, "a\nb\tc");
    }

    #[test]
    fn handles_unicode_escapes() {
        assert_eq!(first("\"\\u0041\"").value, "A");
    }

    #[test]
    fn tokenizes_triple_quoted_strings() {
        let tok = first("\"\"\"hello\nworld\"\"\"");
        assert_eq!(tok.kind, TokenKind::TripleQuotedString);
        assert_eq!(tok.value, "hello\nworld");
        assert!(tok.is_quoted);
    }

    #[test]
    fn strips_leading_newline_from_triple_quoted() {
        assert_eq!(first("\"\"\"\nhello\"\"\"").value, "hello");
    }

    #[test]
    fn tokenizes_unquoted_strings() {
        let tok = first("localhost");
        assert_eq!(tok.kind, TokenKind::Unquoted);
        assert_eq!(tok.value, "localhost");
        assert!(!tok.is_quoted);
    }

    #[test]
    fn tokenizes_numbers_as_unquoted() {
        let tok = first("8080");
        assert_eq!(tok.kind, TokenKind::Unquoted);
        assert_eq!(tok.value, "8080");
    }

    #[test]
    fn tokenizes_substitutions() {
        let tok = first("${server.host}");
        assert_eq!(tok.kind, TokenKind::Substitution);
        assert_eq!(tok.value, "server.host");
    }

    #[test]
    fn tokenizes_optional_substitutions() {
        let tok = first("${?foo}");
        assert_eq!(tok.kind, TokenKind::Substitution);
        assert_eq!(tok.value, "foo");
        assert!(tok.subst.as_ref().unwrap().optional);
    }

    #[test]
    fn tokenizes_newlines() {
        let seen = kinds("a\nb");
        assert_eq!(seen[1], TokenKind::Newline);
    }

    #[test]
    fn deduplicates_consecutive_newlines() {
        let tokens = tokenize("a\n\n\nb").unwrap();
        let newline_count = tokens
            .iter()
            .filter(|tok| tok.kind == TokenKind::Newline)
            .count();
        assert_eq!(newline_count, 1);
    }

    #[test]
    fn tracks_line_and_col() {
        let tokens = tokenize("a\nb").unwrap();
        // Index 1 is the Newline token between the two words.
        let (a, b) = (&tokens[0], &tokens[2]);
        assert_eq!(a.line, 1);
        assert_eq!(a.col, 1);
        assert_eq!(b.line, 2);
        assert_eq!(b.col, 1);
    }

    #[test]
    fn sets_preceding_space() {
        let tokens = tokenize("a b").unwrap();
        assert!(tokens[1].preceding_space);
        assert!(!tokens[0].preceding_space);
    }

    #[test]
    fn strips_utf8_bom() {
        assert_eq!(first("\u{FEFF}foo").value, "foo");
    }

    #[test]
    fn stops_unquoted_at_dollar_for_concat() {
        let tokens = tokenize("foo${bar}").unwrap();
        let (head, subst) = (&tokens[0], &tokens[1]);
        assert_eq!(head.kind, TokenKind::Unquoted);
        assert_eq!(head.value, "foo");
        assert_eq!(subst.kind, TokenKind::Substitution);
        assert_eq!(subst.value, "bar");
        assert!(!subst.preceding_space);
    }

    #[test]
    fn throws_on_unterminated_string() {
        let outcome = tokenize("\"unterminated");
        assert!(outcome.is_err());
    }

    #[test]
    fn throws_on_unterminated_substitution() {
        let outcome = tokenize("${foo");
        assert!(outcome.is_err());
    }

    #[test]
    fn throws_on_unterminated_triple_quoted_string() {
        let outcome = tokenize("\"\"\"unterminated");
        assert!(outcome.is_err());
    }

    // -------------------------------------------------------------------------
    // Spec compliance Phase 1 (issue #60): lexer-level rules.
    //
    // Every test below carries its xx.hocon spec checklist ID (S<n>.<m>).
    //
    // How known spec violations are recorded:
    //   - The spec-correct test gets #[ignore = "spec violation, see #NN"], so
    //     CI stays green while the implementation is wrong; deleting the
    //     attribute after the fix lands makes the test mandatory again.
    //   - When it is unclear which way a fix would go (e.g. S6.x, where a
    //     "fix" could plausibly reject or accept), a companion `_pin` test
    //     without #[ignore] asserts the *current* broken behavior as a
    //     regression net.
    // -------------------------------------------------------------------------

    // --- S2.3: comment markers inside quoted strings are literal -------------
    // Spec L126: "//" and "#" appearing inside a double-quoted string are plain
    // string content and must NOT open a comment.
    #[test]
    fn s2_3_comment_markers_inside_quoted_string_are_literal() {
        // The "//" in the URL must not be read as a comment opener.
        let url = first(r#""http://example.com""#);
        assert_eq!(url.kind, TokenKind::QuotedString);
        assert_eq!(url.value, "http://example.com");

        // Likewise a leading "#" inside quotes is just text.
        let hash = first("\"# not a comment\"");
        assert_eq!(hash.kind, TokenKind::QuotedString);
        assert_eq!(hash.value, "# not a comment");
    }

    // --- S6.1: Unicode Zs / Zl / Zp category chars are whitespace -----------
    // Spec L170: any character in a Unicode whitespace category (Zs, Zl, Zp)
    // separates tokens and is never unquoted string content.
    // All Zs/Zl/Zp members are covered by is_hocon_whitespace.
    //
    // Spec-correct test: an em space splits the input into two unquoted tokens.
    #[test]
    fn s6_1_em_space_separates_tokens_spec() {
        let words = unquoted("a\u{2003}b");
        assert_eq!(words.len(), 2, "em space should separate two tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: the line separator (U+2028, Zl) is whitespace.
    #[test]
    fn s6_1_line_separator_separates_tokens_spec() {
        let words = unquoted("a\u{2028}b");
        assert_eq!(words.len(), 2, "U+2028 (Zl) should separate two tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // --- S6.2: non-breaking spaces are whitespace ----------------------------
    // Spec L171: U+00A0 (NBSP), U+2007 (figure space) and U+202F (narrow NBSP)
    // count as whitespace. All three are in is_hocon_whitespace.

    // Spec-correct test: NBSP (U+00A0) splits tokens.
    #[test]
    fn s6_2_nbsp_separates_tokens_spec() {
        let words = unquoted("a\u{00A0}b");
        assert_eq!(words.len(), 2, "NBSP should separate two tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: figure space (U+2007) splits tokens.
    #[test]
    fn s6_2_figure_space_separates_tokens_spec() {
        let words = unquoted("a\u{2007}b");
        assert_eq!(words.len(), 2, "figure space should separate two tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: narrow NBSP (U+202F) splits tokens.
    #[test]
    fn s6_2_narrow_nbsp_separates_tokens_spec() {
        let words = unquoted("a\u{202F}b");
        assert_eq!(words.len(), 2, "narrow NBSP should separate two tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // --- S6.4: ASCII control whitespace --------------------------------------
    // Spec L174 names 8 whitespace characters: tab (0x09), vtab (0x0B),
    // FF (0x0C), CR (0x0D), FS (0x1C), GS (0x1D), RS (0x1E), US (0x1F).
    // All 8 are now covered by is_hocon_whitespace.

    #[test]
    fn s6_4_tab_is_whitespace() {
        // Tab (0x09) belongs to the HOCON whitespace set.
        let words = unquoted("a\tb");
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    #[test]
    fn s6_4_cr_is_whitespace() {
        // CR (0x0D) belongs to the HOCON whitespace set. A lone CR (no LF)
        // is inter-token whitespace; it does not emit a newline.
        let words = unquoted("a\rb");
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: vtab (0x0B) is whitespace.
    #[test]
    fn s6_4_vtab_is_whitespace_spec() {
        let words = unquoted("a\x0Bb");
        assert_eq!(words.len(), 2, "vtab should separate tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: form feed (0x0C) is whitespace.
    #[test]
    fn s6_4_ff_is_whitespace_spec() {
        let words = unquoted("a\x0Cb");
        assert_eq!(words.len(), 2, "FF should separate tokens");
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // Spec-correct test: FS, GS, RS, US (0x1C–0x1F) are whitespace.
    // The four are checked together because they had one shared root cause
    // (absent from the lexer's whitespace check) and one shared fix.
    #[test]
    fn s6_4_fs_gs_rs_us_are_whitespace_spec() {
        let separators = [
            ("FS (0x1C)", '\x1C'),
            ("GS (0x1D)", '\x1D'),
            ("RS (0x1E)", '\x1E'),
            ("US (0x1F)", '\x1F'),
        ];
        for (label, ch) in separators {
            let words = unquoted(&format!("a{}b", ch));
            assert_eq!(words.len(), 2, "{label} should separate tokens");
            assert_eq!(words[0].value, "a", "{label}");
            assert_eq!(words[1].value, "b", "{label}");
        }
    }

    // --- LF regression guard: LF must still emit Newline token ---------------
    // Since predicate centralization, is_hocon_whitespace is true for LF, so
    // the newline branch has to run BEFORE the whitespace skip for LF to keep
    // producing TokenKind::Newline (per spec §D, design invariant).
    #[test]
    fn s6_lf_still_emits_newline_token() {
        let seen = kinds("a\nb");
        assert!(
            seen.contains(&TokenKind::Newline),
            "LF must still emit a Newline token after whitespace predicate centralization"
        );
    }

    // --- S6.3 (broadened): BOM mid-stream is whitespace ----------------------
    // Spec L173: BOM (U+FEFF) is whitespace, not a start-of-input marker.
    // The lexer still strips a BOM at char index 0 (harmless redundancy); a
    // BOM later in the stream is consumed as inter-token whitespace through
    // is_hocon_whitespace.
    //
    // Spec-correct test: a mid-stream BOM splits two unquoted tokens.
    #[test]
    fn s6_3_bom_midstream_is_whitespace() {
        let words = unquoted("a\u{FEFF}b");
        assert_eq!(
            words.len(),
            2,
            "BOM mid-stream should separate two tokens"
        );
        assert_eq!(words[0].value, "a");
        assert_eq!(words[1].value, "b");
    }

    // --- S8.6 / E8: unquoted string begin rules (post-E8 amendment) ---------
    //
    // The E8 amendment (xx.hocon#31 / commit dd102e8) interprets the "begin"
    // of HOCON.md L270-276 as value-position begin (the first component of a
    // concatenation) rather than token-position begin at an arbitrary lexer
    // offset. At value-start:
    //   - the whole run is lexed as one unquoted token (there is no separate
    //     number token kind); numeric coercion happens afterwards in
    //     parse_scalar_value. Runs that do not parse as numbers (e.g.
    //     `123abc`) remain strings.
    //   - a `-` that is not followed by a digit starts an unquoted run (the
    //     strict lexer-level reject was removed per E8).
    // Path-element rules (substitution body, dotted key segments) stay
    // strict — see tests/s8_unquoted_starts.rs.

    #[test]
    fn e8_value_start_digit_leading_with_letters_is_string() {
        // `123abc` is not a valid number, so parse_scalar_value falls back to
        // ScalarType::String. This matches Lightbend, where parseLong and
        // parseFloat both fail and an unquoted concat results.
        // The resolved value is asserted (not merely is_ok) so that any
        // accidental coercion or truncation shows up here.
        let cfg = crate::parse("x = 123abc").expect("parse failed");
        let resolved = cfg.get_string("x").expect("x not found");
        assert_eq!(
            resolved, "123abc",
            "E8: `123abc` must lex+resolve as unquoted string \"123abc\""
        );
    }

    #[test]
    fn e8_value_start_hyphen_leading_non_number_is_string() {
        // Before E8 this was a lex error (strict S8.6 reading). After E8,
        // `-foo` is an unquoted string at value-position: an RFC 8259 JSON
        // number needs a digit after `-`, so bare `-foo` is outside L270's
        // disallow scope. The Lightbend reference yields `{"x":"-foo"}`.
        // The resolved value is asserted (not merely is_ok) so that any
        // accidental coercion or truncation shows up here.
        let cfg = crate::parse("x = -foo").expect("parse failed");
        let resolved = cfg.get_string("x").expect("x not found");
        assert_eq!(
            resolved, "-foo",
            "E8: `-foo` must lex+resolve as unquoted string \"-foo\""
        );
    }

    // --- S8.7: no escape sequences in unquoted strings -----------------------
    // Spec L253: escape sequences are never interpreted in unquoted strings.
    // A backslash is forbidden inside an unquoted run (in rs.hocon it ends the
    // run, because '\' is excluded from is_unquoted_start and
    // is_unquoted_continue), and the bare backslash then causes a lexer error.
    #[test]
    fn s8_7_backslash_is_rejected_in_unquoted_context() {
        // For `a\n` outside quotes the lexer reads 'a' as unquoted and then
        // meets '\', which is neither a valid unquoted character nor a known
        // token introducer, so lexing should fail.
        let outcome = tokenize(r"a\n");
        assert!(
            outcome.is_err(),
            "bare backslash outside quotes must be rejected"
        );
    }

    // --- S8.8: unquoted strings allow control chars except forbidden set -----
    // Spec L280: control characters OTHER than the forbidden set (L245:
    // $ " { } [ ] : = , + # ` ^ ? ! @ * & \ and whitespace) are permitted
    // inside unquoted strings.
    #[test]
    fn s8_8_soh_allowed_in_unquoted_string() {
        // SOH (0x01) is a control character outside the forbidden set.
        let words = unquoted("foo\x01bar");
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].value, "foo\x01bar");
    }

    #[test]
    fn s8_8_bel_allowed_in_unquoted_string() {
        // BEL (0x07) is a control character outside the forbidden set.
        let words = unquoted("foo\x07bar");
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].value, "foo\x07bar");
    }
}