1use crate::error::ParseError;
2
/// Lexical category assigned to every [`Token`] produced by [`tokenize`].
///
/// The enum carries no data, so `Eq` and `Hash` are derived next to
/// `PartialEq`; this lets kinds be used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `=`
    Equals,
    /// `+=`
    PlusEquals,
    /// A line feed. The tokenizer does not emit a second `Newline` directly
    /// after another one, so blank lines collapse into a single token.
    Newline,
    /// A `"..."` string; the token value holds the decoded content.
    QuotedString,
    /// A `"""..."""` string; the token value holds the raw content.
    TripleQuotedString,
    /// A run of unquoted text.
    Unquoted,
    /// A `${...}` / `${?...}` substitution; details live in `Token::subst`.
    Substitution,
    /// End-of-input marker, always the last token of a successful tokenize.
    Eof,
}
20
/// One dot-separated component of a substitution path, together with the
/// source position at which that component started.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// Text of the segment. Quoted parts are stored decoded (escape sequences
    /// already resolved) and adjacent quoted/unquoted parts are concatenated.
    pub text: String,
    /// Line recorded for the segment; the substitution parser stores the line
    /// on which the enclosing `${` started.
    pub line: usize,
    /// Column of the segment's first character (the opening quote for a
    /// segment that begins with a quoted part).
    pub col: usize,
}
27
/// Structured contents of a `${...}` substitution, as parsed by the tokenizer.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct SubstPayload {
    /// Path components in source order; never empty for a successfully
    /// parsed substitution.
    pub segments: Vec<Segment>,
    /// `true` when the substitution was written `${?...}`.
    pub optional: bool,
    /// `true` when the path was followed by a literal `[]` before the
    /// closing `}`.
    pub list_suffix: bool,
}
42
/// A single lexical token produced by [`tokenize`], carrying its source
/// position and the spacing that preceded it.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Token {
    /// Lexical category of the token.
    pub kind: TokenKind,
    /// Token text: the literal characters for punctuation and newlines, the
    /// string content for quoted/triple-quoted/unquoted tokens, a
    /// dot-joined rendering of the path for substitutions, and the empty
    /// string for `Eof`.
    pub value: String,
    /// 1-based line on which the token starts.
    pub line: usize,
    /// 1-based column (counted in `char`s) at which the token starts.
    pub col: usize,
    /// `true` for `QuotedString` and `TripleQuotedString` tokens only.
    // NOTE(review): the reason for suppressing `dead_code` on this field is
    // not visible in this file — confirm the attribute is still needed.
    #[allow(dead_code)]
    pub is_quoted: bool,
    /// `true` when whitespace or a comment was skipped since the previous
    /// token was emitted. Always `false` on the `Eof` token.
    pub preceding_space: bool,
    /// The exact non-newline whitespace characters skipped since the previous
    /// token was emitted (comments contribute nothing). Always empty on the
    /// `Eof` token.
    pub preceding_whitespace: String,
    /// Parsed substitution details; `Some` only for `Substitution` tokens.
    pub subst: Option<SubstPayload>,
}
77
/// Returns `true` when `ch` is treated as whitespace by the tokenizer.
///
/// Note that line feed (`'\n'`) is part of this set; callers that need to
/// treat newlines specially must test for them first.
pub(crate) fn is_hocon_whitespace(ch: char) -> bool {
    matches!(
        ch,
        // TAB, LF, VT, FF and CR are five consecutive code points.
        '\u{0009}'..='\u{000D}'
            // FS, GS, RS, US, immediately followed by the ASCII space.
            | '\u{001C}'..='\u{0020}'
            // No-break space and Ogham space mark.
            | '\u{00A0}'
            | '\u{1680}'
            // En quad through hair space.
            | '\u{2000}'..='\u{200A}'
            // Line separator and paragraph separator.
            | '\u{2028}'..='\u{2029}'
            // Narrow no-break, medium mathematical and ideographic spaces.
            | '\u{202F}'
            | '\u{205F}'
            | '\u{3000}'
            // Byte-order mark / zero-width no-break space.
            | '\u{FEFF}'
    )
}
107
/// Returns `true` only for the line feed character; carriage return and the
/// Unicode line/paragraph separators are not newlines for the tokenizer.
fn is_hocon_newline(ch: char) -> bool {
    matches!(ch, '\n')
}
116
117pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
118 let chars: Vec<char> = input.chars().collect();
119 let mut tokens = Vec::new();
120 let mut pos = 0usize;
121 let mut line = 1usize;
122 let mut col = 1usize;
123 let mut had_space = false;
124 let mut whitespace_buffer = String::new();
128
129 if !chars.is_empty() && chars[0] == '\u{FEFF}' {
131 pos = 1;
132 }
133
134 let peek =
135 |pos: usize, offset: usize| -> char { chars.get(pos + offset).copied().unwrap_or('\0') };
136
137 while pos < chars.len() {
138 let sl = line;
139 let sc = col;
140 let ch = chars[pos];
141
142 if is_hocon_newline(ch) {
145 pos += 1;
146 line += 1;
147 col = 1;
148 if tokens
149 .last()
150 .is_none_or(|t: &Token| t.kind != TokenKind::Newline)
151 {
152 tokens.push(Token {
153 kind: TokenKind::Newline,
154 value: "\n".into(),
155 line: sl,
156 col: sc,
157 is_quoted: false,
158 preceding_space: had_space,
159 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
160 subst: None,
161 });
162 had_space = false;
163 }
164 continue;
165 }
166
167 if is_hocon_whitespace(ch) {
169 whitespace_buffer.push(ch);
170 pos += 1;
171 col += 1;
172 had_space = true;
173 continue;
174 }
175
176 if ch == '/' && peek(pos, 1) == '/' {
178 while pos < chars.len() && chars[pos] != '\n' {
179 pos += 1;
180 col += 1;
181 }
182 had_space = true;
183 continue;
184 }
185 if ch == '#' {
186 while pos < chars.len() && chars[pos] != '\n' {
187 pos += 1;
188 col += 1;
189 }
190 had_space = true;
191 continue;
192 }
193
194 let single_kind = match ch {
196 '{' => Some(TokenKind::LBrace),
197 '}' => Some(TokenKind::RBrace),
198 '[' => Some(TokenKind::LBracket),
199 ']' => Some(TokenKind::RBracket),
200 ',' => Some(TokenKind::Comma),
201 ':' => Some(TokenKind::Colon),
202 _ => None,
203 };
204 if let Some(kind) = single_kind {
205 pos += 1;
206 col += 1;
207 tokens.push(Token {
208 kind,
209 value: ch.to_string(),
210 line: sl,
211 col: sc,
212 is_quoted: false,
213 preceding_space: had_space,
214 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
215 subst: None,
216 });
217 had_space = false;
218 continue;
219 }
220
221 if ch == '=' {
223 pos += 1;
224 col += 1;
225 tokens.push(Token {
226 kind: TokenKind::Equals,
227 value: "=".into(),
228 line: sl,
229 col: sc,
230 is_quoted: false,
231 preceding_space: had_space,
232 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
233 subst: None,
234 });
235 had_space = false;
236 continue;
237 }
238 if ch == '+' && peek(pos, 1) == '=' {
239 pos += 2;
240 col += 2;
241 tokens.push(Token {
242 kind: TokenKind::PlusEquals,
243 value: "+=".into(),
244 line: sl,
245 col: sc,
246 is_quoted: false,
247 preceding_space: had_space,
248 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
249 subst: None,
250 });
251 had_space = false;
252 continue;
253 }
254
255 if ch == '$' && peek(pos, 1) == '{' {
257 pos += 2;
258 col += 2;
259 let payload = parse_subst_body(&chars, &mut pos, &mut col, sl, sc)?;
260 let value = payload
263 .segments
264 .iter()
265 .map(|s| {
266 let t = &s.text;
267 if t.is_empty()
268 || t.contains('.')
269 || t.contains(' ')
270 || t.contains('\t')
271 || t.contains('"')
272 || t.contains('\\')
273 || t != t.trim()
274 {
275 let escaped = t.replace('\\', "\\\\").replace('"', "\\\"");
276 format!("\"{}\"", escaped)
277 } else {
278 t.clone()
279 }
280 })
281 .collect::<Vec<_>>()
282 .join(".");
283 tokens.push(Token {
284 kind: TokenKind::Substitution,
285 value,
286 line: sl,
287 col: sc,
288 is_quoted: false,
289 preceding_space: had_space,
290 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
291 subst: Some(payload),
292 });
293 had_space = false;
294 continue;
295 }
296
297 if ch == '"' && peek(pos, 1) == '"' && peek(pos, 2) == '"' {
299 pos += 3;
300 col += 3;
301 let mut value = String::new();
302 let mut found_closing = false;
303 loop {
304 if pos >= chars.len() {
305 break;
306 }
307 if chars[pos] == '"' {
308 let mut quote_count = 0;
309 while pos < chars.len() && chars[pos] == '"' {
310 quote_count += 1;
311 pos += 1;
312 col += 1;
313 }
314 if quote_count >= 3 {
315 for _ in 0..(quote_count - 3) {
316 value.push('"');
317 }
318 found_closing = true;
319 break;
320 }
321 for _ in 0..quote_count {
322 value.push('"');
323 }
324 continue;
325 }
326 if chars[pos] == '\n' {
327 line += 1;
328 col = 1;
329 } else {
330 col += 1;
331 }
332 value.push(chars[pos]);
333 pos += 1;
334 }
335 if !found_closing {
336 return Err(ParseError {
337 message: "unterminated triple-quoted string".into(),
338 line: sl,
339 col: sc,
340 });
341 }
342 if value.starts_with('\n') {
343 value = value[1..].to_string();
344 }
345 tokens.push(Token {
346 kind: TokenKind::TripleQuotedString,
347 value,
348 line: sl,
349 col: sc,
350 is_quoted: true,
351 preceding_space: had_space,
352 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
353 subst: None,
354 });
355 had_space = false;
356 continue;
357 }
358
359 if ch == '"' {
361 pos += 1;
362 col += 1;
363 let value = read_quoted_body(&chars, &mut pos, &mut col, sl, sc)?;
364 tokens.push(Token {
365 kind: TokenKind::QuotedString,
366 value,
367 line: sl,
368 col: sc,
369 is_quoted: true,
370 preceding_space: had_space,
371 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
372 subst: None,
373 });
374 had_space = false;
375 continue;
376 }
377
378 if is_unquoted_start(ch) {
380 let mut value = String::new();
394 while pos < chars.len() && is_unquoted_continue(chars[pos], || peek(pos, 1)) {
395 value.push(chars[pos]);
396 pos += 1;
397 col += 1;
398 }
399 let trimmed = value.trim_end().to_string();
400 tokens.push(Token {
401 kind: TokenKind::Unquoted,
402 value: trimmed,
403 line: sl,
404 col: sc,
405 is_quoted: false,
406 preceding_space: had_space,
407 preceding_whitespace: std::mem::take(&mut whitespace_buffer),
408 subst: None,
409 });
410 had_space = false;
411 continue;
412 }
413
414 return Err(ParseError {
415 message: format!("unexpected character: {:?}", ch),
416 line: sl,
417 col: sc,
418 });
419 }
420
421 tokens.push(Token {
422 kind: TokenKind::Eof,
423 value: String::new(),
424 line,
425 col,
426 is_quoted: false,
427 preceding_space: false,
428 preceding_whitespace: String::new(),
429 subst: None,
430 });
431 Ok(tokens)
432}
433
434fn read_quoted_body(
438 chars: &[char],
439 pos: &mut usize,
440 col: &mut usize,
441 open_line: usize,
442 open_col: usize,
443) -> Result<String, ParseError> {
444 let mut value = String::new();
445 while *pos < chars.len() && chars[*pos] != '"' {
446 if chars[*pos] == '\n' {
447 return Err(ParseError {
448 message: "unterminated string".into(),
449 line: open_line,
450 col: open_col,
451 });
452 }
453 if chars[*pos] == '\\' {
454 let esc_col = *col;
455 *pos += 1;
456 *col += 1;
457 if *pos >= chars.len() {
458 return Err(ParseError {
459 message: "unterminated string".into(),
460 line: open_line,
461 col: open_col,
462 });
463 }
464 let esc = chars[*pos];
465 *pos += 1;
466 *col += 1;
467 match esc {
468 'n' => value.push('\n'),
469 't' => value.push('\t'),
470 'r' => value.push('\r'),
471 '"' => value.push('"'),
472 '\\' => value.push('\\'),
473 '/' => value.push('/'),
474 'b' => value.push('\u{0008}'),
475 'f' => value.push('\u{000C}'),
476 'u' => {
477 let hex: String = chars[*pos..].iter().take(4).collect();
478 if hex.len() < 4 || !hex.chars().all(|c| c.is_ascii_hexdigit()) {
479 return Err(ParseError {
480 message: "invalid unicode escape".into(),
481 line: open_line,
482 col: esc_col,
483 });
484 }
485 let code = u32::from_str_radix(&hex, 16).map_err(|_| ParseError {
486 message: "invalid unicode escape".into(),
487 line: open_line,
488 col: esc_col,
489 })?;
490 let c = char::from_u32(code).ok_or_else(|| ParseError {
491 message: "invalid unicode escape".into(),
492 line: open_line,
493 col: esc_col,
494 })?;
495 value.push(c);
496 *pos += 4;
497 *col += 4;
498 }
499 _ => {
500 return Err(ParseError {
501 message: "invalid escape sequence".into(),
502 line: open_line,
503 col: esc_col,
504 });
505 }
506 }
507 } else {
508 value.push(chars[*pos]);
509 *pos += 1;
510 *col += 1;
511 }
512 }
513 if *pos >= chars.len() || chars[*pos] != '"' {
514 return Err(ParseError {
515 message: "unterminated string".into(),
516 line: open_line,
517 col: open_col,
518 });
519 }
520 *pos += 1;
521 *col += 1;
522 Ok(value)
523}
524
525fn is_unquoted_subst_char(ch: char) -> bool {
530 if is_hocon_whitespace(ch) {
531 return false;
532 }
533 !matches!(
534 ch,
535 '"' | '\\'
536 | '{'
537 | '}'
538 | '['
539 | ']'
540 | ':'
541 | '='
542 | ','
543 | '+'
544 | '#'
545 | '`'
546 | '^'
547 | '?'
548 | '!'
549 | '@'
550 | '*'
551 | '&'
552 | '$'
553 | '.'
554 )
555}
556
557fn parse_literal_brackets(
562 chars: &[char],
563 pos: &mut usize,
564 col: &mut usize,
565 start_line: usize,
566) -> Result<(), ParseError> {
567 debug_assert!(*pos < chars.len() && chars[*pos] == '[');
569 *pos += 1;
570 *col += 1;
571 if *pos >= chars.len() || chars[*pos] != ']' {
573 let got = chars
574 .get(*pos)
575 .map(|c| c.escape_debug().to_string())
576 .unwrap_or_else(|| "EOF".into());
577 return Err(ParseError {
578 message: format!(
579 "expected ']' after '[' in substitution list suffix, got {}",
580 got
581 ),
582 line: start_line,
583 col: *col,
584 });
585 }
586 *pos += 1;
587 *col += 1;
588 Ok(())
589}
590
/// Parses the inside of a `${...}` substitution into a [`SubstPayload`].
///
/// `*pos` must point at the first character after `${`. On success `*pos`
/// and `*col` have been advanced past the closing `}`.
///
/// `start_line` / `start_col` locate the opening `${`; they are used for
/// "unterminated" and "empty path" errors, and `start_line` is used as the
/// line of every other error because a substitution cannot span lines.
///
/// # Errors
///
/// Returns a [`ParseError`] when the substitution is not closed before a
/// line feed or the end of input, when the path or one of its segments is
/// empty, when an unquoted segment starts with `-` not followed by a digit,
/// when a `[]` suffix is malformed or not directly followed by `}`, or when
/// a character that is not allowed in a path is encountered.
fn parse_subst_body(
    chars: &[char],
    pos: &mut usize,
    col: &mut usize,
    start_line: usize,
    start_col: usize,
) -> Result<SubstPayload, ParseError> {
    // A leading '?' marks the substitution as optional (`${?path}`).
    let optional = if *pos < chars.len() && chars[*pos] == '?' {
        *pos += 1;
        *col += 1;
        true
    } else {
        false
    };

    // State of the segment currently being assembled. `cur_started` becomes
    // true once the segment has received its first quoted or unquoted part;
    // `cur_line`/`cur_col` remember where that first part began.
    let mut cur_text = String::new();
    let mut cur_started = false;
    let mut cur_line = 0usize;
    let mut cur_col = 0usize;

    // Whitespace seen since the last part. It is only kept when more segment
    // content follows it, so leading and trailing whitespace are dropped.
    let mut pending_ws = String::new();
    let mut segments: Vec<Segment> = Vec::new();
    // Position of the most recent '.', used to report a trailing dot.
    let mut last_dot: Option<(usize, usize)> = None;
    let mut list_suffix = false;

    loop {
        if *pos >= chars.len() {
            return Err(ParseError {
                message: "unterminated substitution".into(),
                line: start_line,
                col: start_col,
            });
        }
        let ch = chars[*pos];

        match ch {
            // End of the substitution; trailing whitespace is discarded.
            '}' => {
                *pos += 1;
                *col += 1;
                pending_ws.clear();
                break;
            }
            // Quoted part: decoded and appended to the current segment.
            '"' => {
                let q_line = start_line; let q_col = *col;
                // Whitespace between two parts of the same segment is kept.
                if cur_started {
                    cur_text.push_str(&pending_ws);
                }
                pending_ws.clear();
                *pos += 1;
                *col += 1;
                let decoded = read_quoted_body(chars, pos, col, q_line, q_col)?;
                cur_text.push_str(&decoded);
                if !cur_started {
                    cur_line = q_line;
                    cur_col = q_col;
                    cur_started = true;
                }
            }
            // Unquoted part: a maximal run of allowed characters.
            ch if is_unquoted_subst_char(ch) => {
                // A segment may only begin with '-' when a digit follows.
                if ch == '-' && !cur_started {
                    let next = chars.get(*pos + 1).copied().unwrap_or('\0');
                    if !next.is_ascii_digit() {
                        let after = if next == '\0' {
                            String::from("EOF")
                        } else {
                            format!("{:?}", next)
                        };
                        return Err(ParseError {
                            message: format!(
                                "unquoted path segment cannot begin with '-' unless followed by a digit (got '-' then {}, HOCON.md L270-276)",
                                after
                            ),
                            line: start_line,
                            col: *col,
                        });
                    }
                }
                let uq_col = *col;
                // Whitespace between two parts of the same segment is kept.
                if cur_started {
                    cur_text.push_str(&pending_ws);
                }
                pending_ws.clear();
                if !cur_started {
                    cur_line = start_line;
                    cur_col = uq_col;
                    cur_started = true;
                }
                while *pos < chars.len() && is_unquoted_subst_char(chars[*pos]) {
                    cur_text.push(chars[*pos]);
                    *pos += 1;
                    *col += 1;
                }
            }
            // Path separator: closes the current segment.
            '.' => {
                let dot_col = *col;
                // Whitespace directly before a dot is not part of the segment.
                pending_ws.clear();
                if !cur_started {
                    return Err(ParseError {
                        message: "empty segment in path".into(),
                        line: start_line,
                        col: dot_col,
                    });
                }
                segments.push(Segment {
                    text: std::mem::take(&mut cur_text),
                    line: cur_line,
                    col: cur_col,
                });
                cur_started = false;
                cur_line = 0;
                cur_col = 0;
                last_dot = Some((start_line, dot_col));
                *pos += 1;
                *col += 1;
            }
            // `[]` list suffix: must follow a segment and be followed by '}'.
            '[' => {
                if !cur_started {
                    return Err(ParseError {
                        message: "empty segment before '[]' suffix in substitution".into(),
                        line: start_line,
                        col: *col,
                    });
                }
                // Only plain spaces and tabs may separate the path from '['.
                for w in pending_ws.chars() {
                    if w != ' ' && w != '\t' {
                        return Err(ParseError {
                            message: format!(
                                "only ASCII space or tab allowed between substitution path and '[]' suffix (got {:?}, HOCON extra-spec E7)",
                                w
                            ),
                            line: start_line,
                            col: *col,
                        });
                    }
                }
                segments.push(Segment {
                    text: std::mem::take(&mut cur_text),
                    line: cur_line,
                    col: cur_col,
                });
                cur_started = false;
                pending_ws.clear();
                parse_literal_brackets(chars, pos, col, start_line)?;
                list_suffix = true;
                if *pos >= chars.len() || chars[*pos] != '}' {
                    return Err(ParseError {
                        message: "expected '}' after '[]' in substitution".into(),
                        line: start_line,
                        col: *col,
                    });
                }
                *pos += 1;
                *col += 1;
                break;
            }
            // Non-newline whitespace is buffered until we know what follows.
            ch if is_hocon_whitespace(ch) && !is_hocon_newline(ch) => {
                pending_ws.push(ch);
                *pos += 1;
                *col += 1;
            }
            // A substitution must be closed on the line it was opened on.
            '\n' => {
                return Err(ParseError {
                    message: "unterminated substitution".into(),
                    line: start_line,
                    col: start_col,
                });
            }
            other => {
                return Err(ParseError {
                    message: format!(
                        "unexpected character in substitution path: {}",
                        other.escape_debug()
                    ),
                    line: start_line,
                    col: *col,
                });
            }
        }
    }

    if cur_started {
        // Flush the segment that was still open when '}' was reached.
        segments.push(Segment {
            text: cur_text,
            line: cur_line,
            col: cur_col,
        });
    } else if segments.is_empty() {
        // Nothing between the braces, e.g. `${}` or `${?}`.
        return Err(ParseError {
            message: "empty substitution path".into(),
            line: start_line,
            col: start_col,
        });
    } else if !list_suffix {
        // The path ended right after a '.', e.g. `${a.}`.
        let (err_line, err_col) = last_dot.unwrap_or((start_line, start_col));
        return Err(ParseError {
            message: "empty segment in path".into(),
            line: err_line,
            col: err_col,
        });
    }

    Ok(SubstPayload {
        segments,
        optional,
        list_suffix,
    })
}
847
848fn is_unquoted_start(ch: char) -> bool {
849 if is_hocon_whitespace(ch) {
850 return false;
851 }
852 !matches!(
853 ch,
854 '{' | '}'
855 | '['
856 | ']'
857 | ','
858 | ':'
859 | '='
860 | '+'
861 | '#'
862 | '"'
863 | '$'
864 | '?'
865 | '!'
866 | '@'
867 | '*'
868 | '&'
869 | '^'
870 | '\\'
871 )
872}
873
874fn is_unquoted_continue(ch: char, next_fn: impl Fn() -> char) -> bool {
875 if is_hocon_whitespace(ch) {
876 return false;
877 }
878 if matches!(
879 ch,
880 '{' | '}'
881 | '['
882 | ']'
883 | ','
884 | ':'
885 | '='
886 | '#'
887 | '"'
888 | '$'
889 | '?'
890 | '!'
891 | '@'
892 | '*'
893 | '&'
894 | '^'
895 | '\\'
896 ) {
897 return false;
898 }
899 if ch == '+' && next_fn() == '=' {
900 return false;
901 }
902 if ch == '/' && next_fn() == '/' {
903 return false;
904 }
905 true
906}
907
#[cfg(test)]
mod tests {
    use super::*;

    /// Kinds of all tokens of `input`, in order. Panics if tokenizing fails.
    fn kinds(input: &str) -> Vec<TokenKind> {
        let tokens = tokenize(input).unwrap();
        tokens.into_iter().map(|t| t.kind).collect()
    }

    /// First token of `input`. Panics if tokenizing fails.
    fn first(input: &str) -> Token {
        tokenize(input).unwrap().remove(0)
    }

    /// Values of all `Unquoted` tokens of `input`, in order. Panics if
    /// tokenizing fails.
    fn unquoted_values(input: &str) -> Vec<String> {
        tokenize(input)
            .unwrap()
            .into_iter()
            .filter(|t| t.kind == TokenKind::Unquoted)
            .map(|t| t.value)
            .collect()
    }

    // ---- basic token shapes -------------------------------------------

    #[test]
    fn tokenizes_empty_string() {
        assert_eq!(kinds(""), vec![TokenKind::Eof]);
    }

    #[test]
    fn tokenizes_braces_and_brackets() {
        let expected = vec![
            TokenKind::LBrace,
            TokenKind::RBrace,
            TokenKind::LBracket,
            TokenKind::RBracket,
            TokenKind::Eof,
        ];
        assert_eq!(kinds("{}[]"), expected);
    }

    #[test]
    fn tokenizes_equals_and_plus_equals() {
        let found = kinds("=+=");
        assert_eq!(found[0], TokenKind::Equals);
        assert_eq!(found[1], TokenKind::PlusEquals);
    }

    #[test]
    fn tokenizes_colon_and_comma() {
        let expected = vec![TokenKind::Colon, TokenKind::Comma, TokenKind::Eof];
        assert_eq!(kinds(":,"), expected);
    }

    #[test]
    fn skips_slash_comments_keeps_newline() {
        let tokens = tokenize("// comment\nfoo").unwrap();
        let (newline, word) = (&tokens[0], &tokens[1]);
        assert_eq!(newline.kind, TokenKind::Newline);
        assert_eq!(word.kind, TokenKind::Unquoted);
        assert_eq!(word.value, "foo");
    }

    #[test]
    fn skips_hash_comments() {
        let tokens = tokenize("# comment\nfoo").unwrap();
        assert_eq!(tokens[0].kind, TokenKind::Newline);
        assert_eq!(tokens[1].value, "foo");
    }

    #[test]
    fn tokenizes_quoted_strings() {
        let token = first(r#""hello world""#);
        assert_eq!(token.kind, TokenKind::QuotedString);
        assert_eq!(token.value, "hello world");
        assert!(token.is_quoted);
    }

    #[test]
    fn handles_escape_sequences() {
        assert_eq!(first(r#""a\nb\tc""#).value, "a\nb\tc");
    }

    #[test]
    fn handles_unicode_escapes() {
        assert_eq!(first(r#""\u0041""#).value, "A");
    }

    #[test]
    fn tokenizes_triple_quoted_strings() {
        let token = first("\"\"\"hello\nworld\"\"\"");
        assert_eq!(token.kind, TokenKind::TripleQuotedString);
        assert_eq!(token.value, "hello\nworld");
        assert!(token.is_quoted);
    }

    #[test]
    fn strips_leading_newline_from_triple_quoted() {
        assert_eq!(first("\"\"\"\nhello\"\"\"").value, "hello");
    }

    #[test]
    fn tokenizes_unquoted_strings() {
        let token = first("localhost");
        assert_eq!(token.kind, TokenKind::Unquoted);
        assert_eq!(token.value, "localhost");
        assert!(!token.is_quoted);
    }

    #[test]
    fn tokenizes_numbers_as_unquoted() {
        let token = first("8080");
        assert_eq!(token.kind, TokenKind::Unquoted);
        assert_eq!(token.value, "8080");
    }

    #[test]
    fn tokenizes_substitutions() {
        let token = first("${server.host}");
        assert_eq!(token.kind, TokenKind::Substitution);
        assert_eq!(token.value, "server.host");
    }

    #[test]
    fn tokenizes_optional_substitutions() {
        let token = first("${?foo}");
        assert_eq!(token.kind, TokenKind::Substitution);
        assert_eq!(token.value, "foo");
        let payload = token.subst.as_ref().unwrap();
        assert!(payload.optional);
    }

    #[test]
    fn tokenizes_newlines() {
        assert_eq!(kinds("a\nb")[1], TokenKind::Newline);
    }

    #[test]
    fn deduplicates_consecutive_newlines() {
        let newline_count = kinds("a\n\n\nb")
            .into_iter()
            .filter(|kind| *kind == TokenKind::Newline)
            .count();
        assert_eq!(newline_count, 1);
    }

    #[test]
    fn tracks_line_and_col() {
        let tokens = tokenize("a\nb").unwrap();
        let (a, b) = (&tokens[0], &tokens[2]);
        assert_eq!(a.line, 1);
        assert_eq!(a.col, 1);
        assert_eq!(b.line, 2);
        assert_eq!(b.col, 1);
    }

    #[test]
    fn sets_preceding_space() {
        let tokens = tokenize("a b").unwrap();
        assert!(tokens[1].preceding_space);
        assert!(!tokens[0].preceding_space);
    }

    #[test]
    fn strips_utf8_bom() {
        assert_eq!(first("\u{FEFF}foo").value, "foo");
    }

    #[test]
    fn stops_unquoted_at_dollar_for_concat() {
        let tokens = tokenize("foo${bar}").unwrap();
        let (text, subst) = (&tokens[0], &tokens[1]);
        assert_eq!(text.kind, TokenKind::Unquoted);
        assert_eq!(text.value, "foo");
        assert_eq!(subst.kind, TokenKind::Substitution);
        assert_eq!(subst.value, "bar");
        assert!(!subst.preceding_space);
    }

    // ---- error cases ----------------------------------------------------

    #[test]
    fn throws_on_unterminated_string() {
        assert!(tokenize("\"unterminated").is_err());
    }

    #[test]
    fn throws_on_unterminated_substitution() {
        assert!(tokenize("${foo").is_err());
    }

    #[test]
    fn throws_on_unterminated_triple_quoted_string() {
        assert!(tokenize("\"\"\"unterminated").is_err());
    }

    // ---- comments and whitespace ------------------------------------------

    #[test]
    fn s2_3_comment_markers_inside_quoted_string_are_literal() {
        let url = first(r#""http://example.com""#);
        assert_eq!(url.kind, TokenKind::QuotedString);
        assert_eq!(url.value, "http://example.com");

        let hash = first("\"# not a comment\"");
        assert_eq!(hash.kind, TokenKind::QuotedString);
        assert_eq!(hash.value, "# not a comment");
    }

    #[test]
    fn s6_1_em_space_separates_tokens_spec() {
        let words = unquoted_values("a\u{2003}b");
        assert_eq!(words.len(), 2, "em space should separate two tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_1_line_separator_separates_tokens_spec() {
        let words = unquoted_values("a\u{2028}b");
        assert_eq!(words.len(), 2, "U+2028 (Zl) should separate two tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_2_nbsp_separates_tokens_spec() {
        let words = unquoted_values("a\u{00A0}b");
        assert_eq!(words.len(), 2, "NBSP should separate two tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_2_figure_space_separates_tokens_spec() {
        let words = unquoted_values("a\u{2007}b");
        assert_eq!(words.len(), 2, "figure space should separate two tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_2_narrow_nbsp_separates_tokens_spec() {
        let words = unquoted_values("a\u{202F}b");
        assert_eq!(words.len(), 2, "narrow NBSP should separate two tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_4_tab_is_whitespace() {
        let words = unquoted_values("a\tb");
        assert_eq!(words.len(), 2);
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_4_cr_is_whitespace() {
        let words = unquoted_values("a\rb");
        assert_eq!(words.len(), 2);
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_4_vtab_is_whitespace_spec() {
        let words = unquoted_values("a\x0Bb");
        assert_eq!(words.len(), 2, "vtab should separate tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_4_ff_is_whitespace_spec() {
        let words = unquoted_values("a\x0Cb");
        assert_eq!(words.len(), 2, "FF should separate tokens");
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    #[test]
    fn s6_4_fs_gs_rs_us_are_whitespace_spec() {
        let separators = [
            ("FS (0x1C)", '\x1C'),
            ("GS (0x1D)", '\x1D'),
            ("RS (0x1E)", '\x1E'),
            ("US (0x1F)", '\x1F'),
        ];
        for (label, ch) in separators {
            let words = unquoted_values(&format!("a{}b", ch));
            assert_eq!(words.len(), 2, "{label} should separate tokens");
            assert_eq!(words[0], "a", "{label}");
            assert_eq!(words[1], "b", "{label}");
        }
    }

    #[test]
    fn s6_lf_still_emits_newline_token() {
        let has_newline = kinds("a\nb")
            .iter()
            .any(|kind| matches!(kind, TokenKind::Newline));
        assert!(
            has_newline,
            "LF must still emit a Newline token after whitespace predicate centralization"
        );
    }

    #[test]
    fn s6_3_bom_midstream_is_whitespace() {
        let words = unquoted_values("a\u{FEFF}b");
        assert_eq!(
            words.len(),
            2,
            "BOM mid-stream should separate two tokens"
        );
        assert_eq!(words[0], "a");
        assert_eq!(words[1], "b");
    }

    // ---- unquoted value rules ---------------------------------------------

    #[test]
    fn e8_value_start_digit_leading_with_letters_is_string() {
        let cfg = crate::parse("x = 123abc").expect("parse failed");
        let value = cfg.get_string("x").expect("x not found");
        assert_eq!(
            value, "123abc",
            "E8: `123abc` must lex+resolve as unquoted string \"123abc\""
        );
    }

    #[test]
    fn e8_value_start_hyphen_leading_non_number_is_string() {
        let cfg = crate::parse("x = -foo").expect("parse failed");
        let value = cfg.get_string("x").expect("x not found");
        assert_eq!(
            value, "-foo",
            "E8: `-foo` must lex+resolve as unquoted string \"-foo\""
        );
    }

    #[test]
    fn s8_7_backslash_is_rejected_in_unquoted_context() {
        let outcome = tokenize(r"a\n");
        assert!(
            outcome.is_err(),
            "bare backslash outside quotes must be rejected"
        );
    }

    #[test]
    fn s8_8_soh_allowed_in_unquoted_string() {
        let words = unquoted_values("foo\x01bar");
        assert_eq!(words.len(), 1);
        assert_eq!(words[0], "foo\x01bar");
    }

    #[test]
    fn s8_8_bel_allowed_in_unquoted_string() {
        let words = unquoted_values("foo\x07bar");
        assert_eq!(words.len(), 1);
        assert_eq!(words[0], "foo\x07bar");
    }
}