1use crate::error::FormatParseError;
4use crate::types::definitions::{FieldSpec, FieldType};
5use regex;
6use std::collections::HashMap;
7
/// Upper bound on how deeply format patterns may be nested inside one another.
///
/// `parse_pattern` and `parse_field` both return a `PatternError` when a nested
/// field is encountered while `nesting_depth` has already reached this value.
pub const MAX_NESTED_FORMAT_DEPTH: usize = 10;
10
/// Maximum number of unescaped `{` that may be open at the same time inside a
/// single field's format specification; `collect_balanced_format_spec` returns
/// an error once this depth is exceeded.
const MAX_BRACE_DEPTH_IN_FORMAT_SPEC: i32 = 10;
13
/// Everything `parse_pattern` produces for one pattern, in tuple order.
pub type ParsedPatternParts = (
    // Regex source wrapped in `^...$`.
    String,
    // The same regex source without the anchors.
    String,
    // One spec per field, in the order the fields appear in the pattern.
    Vec<FieldSpec>,
    // Original field names; `None` for unnamed and all-digit (numbered) fields.
    Vec<Option<String>>,
    // Normalized capture-group names, parallel to the previous vector.
    Vec<Option<String>>,
    // Maps each normalized capture-group name back to the original field name.
    HashMap<String, String>,
    // `true` only when the pattern contained no literal text and every field
    // was a default unconstrained string.
    bool,
);
26
/// Reports whether a literal can act as a delimiter around an empty field.
///
/// A literal qualifies when it contains at least one non-whitespace character;
/// empty and whitespace-only literals do not.
fn literal_delimits_empty_field(s: &str) -> bool {
    s.chars().any(|c| !c.is_whitespace())
}
31
32fn collect_balanced_format_spec(
35 chars: &mut std::iter::Peekable<std::str::Chars>,
36) -> Result<String, FormatParseError> {
37 let mut out = String::new();
38 let mut depth = 0i32;
39 loop {
40 let Some(&ch) = chars.peek() else {
41 return Err(FormatParseError::PatternError(
42 "Unclosed '{' in pattern: expected '}' to close the field".to_string(),
43 ));
44 };
45 if ch == '}' && depth == 0 {
46 break;
47 }
48 let c = chars
49 .next()
50 .expect("peek matched a char so next() must succeed");
51 match c {
52 '{' => {
53 if chars.peek() == Some(&'{') {
54 chars.next();
55 out.push('{');
56 out.push('{');
57 } else {
58 depth += 1;
59 if depth > MAX_BRACE_DEPTH_IN_FORMAT_SPEC {
60 return Err(FormatParseError::PatternError(
61 "Format specification has too many nested '{' (max 10)".to_string(),
62 ));
63 }
64 out.push('{');
65 }
66 }
67 '}' => {
71 depth -= 1;
72 if depth < 0 {
73 return Err(FormatParseError::PatternError(
74 "Unexpected '}' in format specification".to_string(),
75 ));
76 }
77 out.push('}');
78 }
79 _ => out.push(c),
80 }
81 }
82 Ok(out)
83}
84
/// Checks that the braces in `s` are balanced.
///
/// An escaped `{{` is skipped entirely. Every other `{` opens a level and every
/// `}` closes one (`}}` is *not* treated as an escape here). The string is
/// balanced when no `}` appears without a matching open brace and nothing is
/// left open at the end.
fn brace_balance_valid_for_nested_candidate(s: &str) -> bool {
    let mut open = 0i32;
    let mut rest = s.chars().peekable();

    while let Some(current) = rest.next() {
        if current == '{' {
            // `next_if_eq` swallows the second brace of an escaped `{{`.
            if rest.next_if_eq(&'{').is_none() {
                open += 1;
            }
        } else if current == '}' {
            if open == 0 {
                return false;
            }
            open -= 1;
        }
    }

    open == 0
}
108
109fn is_nested_format_spec_candidate(trimmed: &str) -> bool {
112 if trimmed.len() < 2 {
113 return false;
114 }
115 if !trimmed.starts_with('{') || trimmed.starts_with("{{") {
116 return false;
117 }
118 if !trimmed.ends_with('}') {
119 return false;
120 }
121 brace_balance_valid_for_nested_candidate(trimmed)
122}
123
/// Returns `anchored` with at most one leading `^` and at most one trailing
/// `$` removed. Input without anchors is returned unchanged.
fn strip_regex_anchors(anchored: &str) -> String {
    let mut body = anchored;
    if let Some(rest) = body.strip_prefix('^') {
        body = rest;
    }
    if let Some(rest) = body.strip_suffix('$') {
        body = rest;
    }
    body.to_owned()
}
130
131fn has_trailing_literal_before_next_field(mut chars: std::iter::Peekable<std::str::Chars>) -> bool {
136 while chars.peek().is_some_and(|c| c.is_whitespace()) {
137 chars.next();
138 }
139 if chars.next() != Some('}') {
140 return false;
141 }
142 while chars.peek().is_some_and(|c| c.is_whitespace()) {
143 chars.next();
144 }
145 let mut literal = String::new();
146 loop {
147 match chars.next() {
148 None => return literal_delimits_empty_field(&literal),
149 Some('{') => {
150 if chars.peek() == Some(&'{') {
151 chars.next();
152 literal.push('{');
153 } else {
154 return literal_delimits_empty_field(&literal);
155 }
156 }
157 Some('}') => {
158 if chars.peek() == Some(&'}') {
159 chars.next();
160 literal.push('}');
161 } else {
162 literal.push('}');
163 }
164 }
165 Some(c) => literal.push(c),
166 }
167 }
168}
169
170pub fn parse_pattern(
174 pattern: &str,
175 custom_patterns: &HashMap<String, String>,
176 allow_empty_delimited_default_string: bool,
177 nesting_depth: usize,
178) -> Result<ParsedPatternParts, FormatParseError> {
179 let estimated_fields = pattern.matches('{').count();
181 let mut regex_parts = Vec::with_capacity(estimated_fields * 2);
182 let mut field_specs = Vec::with_capacity(estimated_fields);
183 let mut field_names = Vec::with_capacity(estimated_fields); let mut normalized_names = Vec::with_capacity(estimated_fields); let mut name_mapping = HashMap::with_capacity(estimated_fields); let mut field_name_types = HashMap::with_capacity(estimated_fields); let mut chars: std::iter::Peekable<std::str::Chars> = pattern.chars().peekable();
188 let mut literal = String::new();
189 let mut allows_empty_default_string_match = true;
190
191 while let Some(ch) = chars.next() {
192 match ch {
193 '{' => {
194 if chars.peek() == Some(&'{') {
196 chars.next();
197 literal.push('{');
198 continue;
199 }
200
201 let had_leading_literal = !literal.trim().is_empty();
202
203 if !literal.is_empty() {
205 allows_empty_default_string_match = false;
206 let escaped = if literal.trim_end() != literal {
209 let trimmed = literal.trim_end();
212 let mut escaped_str = String::with_capacity(trimmed.len() + 4);
213 escaped_str.push_str(®ex::escape(trimmed));
214 escaped_str.push_str("\\s+");
215 escaped_str
216 } else {
217 regex::escape(&literal)
218 };
219 regex_parts.push(escaped);
220 literal.clear();
221 }
222
223 let (mut spec, name) = parse_field(&mut chars, nesting_depth)?;
225
226 if matches!(spec.field_type, FieldType::Nested) {
227 if nesting_depth >= MAX_NESTED_FORMAT_DEPTH {
228 return Err(FormatParseError::PatternError(
229 "Nested format patterns exceed max depth (10)".to_string(),
230 ));
231 }
232 let inner = spec.nested_subpattern.as_ref().ok_or_else(|| {
233 FormatParseError::PatternError(
234 "Internal error: nested field missing subpattern".to_string(),
235 )
236 })?;
237 let (inner_anchored, _, _, _, _, _, _) = parse_pattern(
238 inner,
239 custom_patterns,
240 allow_empty_delimited_default_string,
241 nesting_depth + 1,
242 )?;
243 spec.nested_regex_body = Some(strip_regex_anchors(&inner_anchored));
244 }
245
246 if !spec.is_default_unconstrained_string() {
247 allows_empty_default_string_match = false;
248 }
249
250 let has_trailing_literal = has_trailing_literal_before_next_field(chars.clone());
251
252 let mut peek_chars = chars.clone();
255 let next_field_is_greedy = loop {
256 let mut found_closing = false;
258 while let Some(&ch) = peek_chars.peek() {
259 if ch.is_whitespace() {
260 peek_chars.next();
261 } else if ch == '}' {
262 peek_chars.next(); found_closing = true;
264 break;
265 } else {
266 break;
267 }
268 }
269 if !found_closing {
270 break None; }
272 while let Some(&ch) = peek_chars.peek() {
274 if ch.is_whitespace() {
275 peek_chars.next();
276 } else {
277 break;
278 }
279 }
280 if peek_chars.peek() == Some(&'{') {
282 peek_chars.next();
283 if peek_chars.peek() == Some(&'{') {
285 peek_chars.next();
286 continue; }
288 if peek_chars.peek() == Some(&'}') {
290 break Some(false);
292 } else {
293 let mut field_chars = peek_chars.clone();
295 let mut has_precision = false;
296 while let Some(&ch) = field_chars.peek() {
297 if ch == '}' {
298 break;
299 }
300 if ch == ':' {
301 field_chars.next();
302 while let Some(&next_ch) = field_chars.peek() {
304 if next_ch == '}' {
305 break;
306 }
307 if next_ch == '.' {
308 has_precision = true;
309 break;
310 }
311 field_chars.next();
312 }
313 break;
314 }
315 field_chars.next();
316 }
317 break Some(has_precision);
320 }
321 } else {
322 break None;
324 }
325 };
326
327 let allow_empty_delimited = allow_empty_delimited_default_string
328 && spec.is_default_unconstrained_string()
329 && (had_leading_literal || has_trailing_literal);
330 let pattern = spec.to_regex_pattern(
331 custom_patterns,
332 next_field_is_greedy,
333 allow_empty_delimited,
334 );
335 let la_raw = spec.regex_lookahead.as_deref().unwrap_or("");
336 let (lb_prefix, body, la_emit) =
337 crate::rewrite_field_fragments_for_engine_anchor(&pattern, la_raw);
338
339 if let Some(ref original_name) = name {
341 if let Some(existing_type) = field_name_types.get(original_name) {
342 if !field_types_match(existing_type, &spec.field_type) {
344 return Err(FormatParseError::RepeatedNameError(original_name.clone()));
345 }
346 } else {
347 field_name_types.insert(original_name.clone(), spec.field_type.clone());
348 }
349 }
350
351 let group_pattern = if matches!(spec.field_type, FieldType::BracedContent) {
355 let Some(ref original_name) = name else {
356 return Err(FormatParseError::PatternError(
357 "The :brace format requires a named field (e.g. {content:brace})"
358 .to_string(),
359 ));
360 };
361 if original_name.chars().all(|c| c.is_ascii_digit()) {
362 return Err(FormatParseError::PatternError(
363 "The :brace format cannot be used with numbered fields".to_string(),
364 ));
365 }
366 let normalized =
367 normalize_field_name(original_name, &mut name_mapping, &normalized_names);
368 format!("\\{{(?P<{}>.*?)\\}}", normalized)
369 } else if let Some(ref original_name) = name {
370 let is_numeric = original_name.chars().all(|c| c.is_ascii_digit());
372
373 if is_numeric {
374 format!("{}{}({}){}", lb_prefix, "", body, la_emit)
376 } else {
377 let normalized = normalize_field_name(
379 original_name,
380 &mut name_mapping,
381 &normalized_names,
382 );
383 format!("{}{}(?P<{}>{}){}", lb_prefix, "", normalized, body, la_emit)
384 }
385 } else {
386 format!("{}{}({}){}", lb_prefix, "", body, la_emit)
387 };
388
389 regex_parts.push(group_pattern);
390
391 if let Some(ref original_name) = name {
393 let is_numeric = original_name.chars().all(|c| c.is_ascii_digit());
395
396 if is_numeric {
397 field_names.push(None); normalized_names.push(None);
399 } else {
400 let normalized = normalize_field_name(
401 original_name,
402 &mut name_mapping,
403 &normalized_names,
404 );
405 field_names.push(Some(original_name.clone())); normalized_names.push(Some(normalized.clone())); name_mapping.insert(normalized, original_name.clone()); }
409 } else {
410 field_names.push(None);
411 normalized_names.push(None);
412 }
413 field_specs.push(spec);
414
415 if chars.next() != Some('}') {
417 return Err(FormatParseError::PatternError(
418 "Expected '}' after field specification".to_string(),
419 ));
420 }
421 }
422 '}' => {
423 if chars.peek() == Some(&'}') {
425 chars.next();
426 literal.push('}');
427 continue;
428 }
429 literal.push('}');
430 }
431 _ => {
432 literal.push(ch);
433 }
434 }
435 }
436
437 if !literal.is_empty() {
439 allows_empty_default_string_match = false;
440 let escaped = if literal.trim_end() != literal {
442 let trimmed = literal.trim_end();
445 format!("{}\\s*", regex::escape(trimmed))
446 } else {
447 regex::escape(&literal)
448 };
449 regex_parts.push(escaped);
450 }
451
452 let regex_str = regex_parts.join("");
453 let regex_str_with_anchors = format!("^{}$", regex_str);
454 Ok((
455 regex_str_with_anchors,
456 regex_str,
457 field_specs,
458 field_names,
459 normalized_names,
460 name_mapping,
461 allows_empty_default_string_match,
462 ))
463}
464
/// Turns a field name into an identifier usable as a regex capture-group name
/// that does not clash with any name in `existing_normalized`.
///
/// `-`, `.` and `[` become `_`; `]` is dropped. If the result is already
/// taken, underscores are added until it is unique: when the name contains an
/// underscore, the first one is widened (`a_b` -> `a__b` -> `a___b`), otherwise
/// underscores are appended (`ab` -> `ab_` -> `ab__`).
///
/// `_name_mapping` is accepted for interface compatibility and is not used.
pub fn normalize_field_name(
    name: &str,
    _name_mapping: &mut HashMap<String, String>,
    existing_normalized: &[Option<String>],
) -> String {
    let base: String = name
        .chars()
        .filter_map(|c| match c {
            '-' | '.' | '[' => Some('_'),
            ']' => None,
            other => Some(other),
        })
        .collect();
    let first_underscore = base.find('_');

    let is_taken = |candidate: &str| {
        existing_normalized
            .iter()
            .any(|slot| slot.as_deref() == Some(candidate))
    };

    let mut candidate = base.clone();
    let mut attempts = 0usize;
    while is_taken(&candidate) {
        attempts += 1;
        candidate = match first_underscore {
            Some(pos) => format!(
                "{}{}{}",
                &base[..pos],
                "_".repeat(attempts + 1),
                &base[pos + 1..]
            ),
            None => format!("{}{}", base, "_".repeat(attempts)),
        };
    }

    candidate
}
513
514pub fn validate_multiline_mvp(spec: &FieldSpec) -> Result<(), FormatParseError> {
519 if !matches!(
520 spec.field_type,
521 FieldType::Multiline | FieldType::IndentBlock
522 ) {
523 return Ok(());
524 }
525 if spec.sign.is_some() || spec.zero_pad {
526 return Err(FormatParseError::PatternError(
527 "Multiline types :ml and :blk do not support sign or zero-padding".to_string(),
528 ));
529 }
530 if spec.alignment == Some('=') {
531 return Err(FormatParseError::PatternError(
532 "Multiline types :ml and :blk do not support '=' alignment".to_string(),
533 ));
534 }
535 Ok(())
536}
537
538pub fn field_types_match(t1: &FieldType, t2: &FieldType) -> bool {
540 use std::mem::discriminant;
541 discriminant(t1) == discriminant(t2)
542}
543
/// Splits a dict-style field name such as `a[b][c]` into its path segments
/// (`["a", "b", "c"]`).
///
/// `[` always ends the current segment. `]` ends the segment only when a `[`
/// is currently open; otherwise it is kept as an ordinary character. Empty
/// segments are never emitted.
pub fn parse_field_path(field_name: &str) -> Vec<String> {
    // Moves the pending text into the result, unless there is none.
    fn flush_segment(segments: &mut Vec<String>, pending: &mut String) {
        if !pending.is_empty() {
            segments.push(std::mem::take(pending));
        }
    }

    let mut segments = Vec::new();
    let mut pending = String::new();
    let mut inside_index = false;

    for symbol in field_name.chars() {
        match (symbol, inside_index) {
            ('[', _) => {
                flush_segment(&mut segments, &mut pending);
                inside_index = true;
            }
            (']', true) => {
                flush_segment(&mut segments, &mut pending);
                inside_index = false;
            }
            _ => pending.push(symbol),
        }
    }

    flush_segment(&mut segments, &mut pending);
    segments
}
582
583pub fn parse_field(
585 chars: &mut std::iter::Peekable<std::str::Chars>,
586 nesting_depth: usize,
587) -> Result<(FieldSpec, Option<String>), FormatParseError> {
588 let mut spec = FieldSpec::new();
589 let mut field_name = String::new();
590 let mut in_name = true;
591
592 let mut in_brackets = false;
594 while let Some(&ch) = chars.peek() {
595 match ch {
596 ':' => {
597 chars.next();
598 in_name = false;
599 break;
600 }
601 '!' => {
602 chars.next();
603 if chars.peek().is_some() {
605 chars.next();
606 }
607 in_name = false;
608 }
609 '}' => {
610 break;
611 }
612 '[' => {
613 in_brackets = true;
614 field_name.push(ch);
615 chars.next();
616 }
617 ']' => {
618 in_brackets = false;
619 field_name.push(ch);
620 chars.next();
621 }
622 '\'' | '"' => {
623 if in_brackets {
625 return Err(FormatParseError::NotImplementedError(
626 "Quoted keys in field names".to_string(),
627 ));
628 }
629 in_name = false;
631 break;
632 }
633 _ => {
634 if ch.is_alphanumeric() || ch == '_' || ch == '-' || ch == '.' {
636 field_name.push(ch);
637 chars.next();
638 } else {
639 in_name = false;
641 break;
642 }
643 }
644 }
645 }
646
647 if !in_name {
649 let format_spec = collect_balanced_format_spec(chars)?;
650 let trimmed = format_spec.trim();
651 if is_nested_format_spec_candidate(trimmed) {
652 if nesting_depth >= MAX_NESTED_FORMAT_DEPTH {
653 return Err(FormatParseError::PatternError(
654 "Nested format patterns exceed max depth (10)".to_string(),
655 ));
656 }
657 spec.field_type = FieldType::Nested;
658 spec.nested_subpattern = Some(trimmed.to_string());
659 } else {
660 parse_format_spec(&format_spec, &mut spec)?;
661 }
662 validate_multiline_mvp(&spec)?;
663 }
664
665 let name = if field_name.is_empty() {
666 None
667 } else {
668 Some(field_name)
669 };
670
671 Ok((spec, name))
672}
673
674pub fn parse_format_spec(format_spec: &str, spec: &mut FieldSpec) -> Result<(), FormatParseError> {
676 let mut chars = format_spec.chars().peekable();
680
681 if let Some(&ch) = chars.peek() {
684 if ch == '<' || ch == '>' || ch == '^' || ch == '=' {
685 spec.alignment = Some(ch);
686 chars.next();
687 } else {
688 let mut peek_iter = chars.clone();
690 peek_iter.next(); if let Some(next_ch) = peek_iter.next() {
692 if next_ch == '<' || next_ch == '>' || next_ch == '^' || next_ch == '=' {
693 spec.fill = Some(ch);
694 chars.next(); spec.alignment = Some(next_ch);
696 chars.next(); }
698 }
699 }
700 }
701
702 if let Some(&ch) = chars.peek() {
704 if ch == '+' || ch == '-' || ch == ' ' {
705 spec.sign = Some(ch);
706 chars.next();
707 }
708 }
709
710 if chars.peek() == Some(&'#') {
712 chars.next();
713 }
714
715 if chars.peek() == Some(&'0') {
717 spec.zero_pad = true;
718 chars.next();
719 }
720
721 let mut width_str = String::new();
723 while let Some(&ch) = chars.peek() {
724 if ch.is_ascii_digit() {
725 width_str.push(ch);
726 chars.next();
727 } else {
728 break;
729 }
730 }
731 if !width_str.is_empty() {
732 spec.width = width_str.parse::<usize>().ok();
733 }
734
735 if chars.peek() == Some(&',') {
737 chars.next();
738 }
739
740 if chars.peek() == Some(&'.') {
742 chars.next();
743 let mut precision_str = String::new();
744 while let Some(&ch) = chars.peek() {
745 if ch.is_ascii_digit() {
746 precision_str.push(ch);
747 chars.next();
748 } else {
749 break;
750 }
751 }
752 if !precision_str.is_empty() {
753 spec.precision = precision_str.parse::<usize>().ok();
754 }
755 }
756
757 let mut type_str = String::new();
759 for ch in chars {
760 type_str.push(ch);
761 }
762
763 if type_str == "%" {
764 spec.field_type = FieldType::Percentage;
765 return Ok(());
766 }
767 if type_str.starts_with('%') {
768 crate::reject_lookaround_in_strftime(&type_str).map_err(FormatParseError::PatternError)?;
769 spec.field_type = FieldType::DateTimeStrftime;
770 spec.strftime_format = Some(type_str.clone());
771 return Ok(());
772 }
773
774 let (type_base, lookaround_tail) = crate::split_type_base_and_lookaround_tail(&type_str);
775 if type_base.is_empty() && !lookaround_tail.is_empty() {
776 return Err(FormatParseError::PatternError(
777 "Type specification must precede lookaround assertions".to_string(),
778 ));
779 }
780
781 let type_name: String = type_base.chars().filter(|c| c.is_alphabetic()).collect();
783
784 spec.field_type = if type_name.is_empty() {
785 FieldType::String
786 } else if type_name == "ti" {
787 FieldType::DateTimeISO
788 } else if type_name == "te" {
789 FieldType::DateTimeRFC2822
790 } else if type_name == "tg" {
791 FieldType::DateTimeGlobal
792 } else if type_name == "ta" {
793 FieldType::DateTimeUS
794 } else if type_name == "tc" {
795 FieldType::DateTimeCtime
796 } else if type_name == "th" {
797 FieldType::DateTimeHTTP
798 } else if type_name == "tt" {
799 FieldType::DateTimeTime
800 } else if type_name == "ts" {
801 FieldType::DateTimeSystem
802 } else if type_name == "brace" {
803 FieldType::BracedContent
804 } else if type_name == "ml" {
805 FieldType::Multiline
806 } else if type_name == "blk" {
807 FieldType::IndentBlock
808 } else if type_name.len() > 1 {
809 FieldType::Custom(type_name)
810 } else {
811 let type_char = type_name.chars().next().unwrap();
812 spec.original_type_char = Some(type_char);
813 match type_char {
814 's' => FieldType::String,
815 'd' | 'i' => FieldType::Integer,
816 'b' | 'o' | 'x' | 'X' => FieldType::Integer,
817 'n' => FieldType::NumberWithThousands,
818 'f' | 'F' => FieldType::Float,
819 'e' | 'E' => FieldType::Scientific,
820 'g' | 'G' => FieldType::GeneralNumber,
821 'l' => FieldType::Letters,
822 'w' => FieldType::Word,
823 'W' => FieldType::NonLetters,
824 'S' => FieldType::NonWhitespace,
825 'D' => FieldType::NonDigits,
826 c => FieldType::Custom(c.to_string()),
827 }
828 };
829
830 if !lookaround_tail.is_empty() {
831 let (lb, la) = crate::parse_lookaround_tail(lookaround_tail)
832 .map_err(FormatParseError::PatternError)?;
833 match &spec.field_type {
834 FieldType::Integer | FieldType::Float => {
835 spec.regex_lookbehind = if lb.is_empty() { None } else { Some(lb) };
836 spec.regex_lookahead = if la.is_empty() { None } else { Some(la) };
837 }
838 _ => {
839 return Err(FormatParseError::PatternError("Lookaround assertions are only supported for integer and float format types (d, i, b, o, x, X, f, F)".to_string()));
840 }
841 }
842 }
843
844 Ok(())
845}
846
#[cfg(test)]
mod normalize_field_name_tests {
    use super::normalize_field_name;
    use std::collections::HashMap;

    /// Dict-style names: each `[` becomes `_` and each `]` is dropped.
    #[test]
    fn dict_style_brackets_map_to_underscores() {
        let mut mapping = HashMap::new();
        let no_existing: Vec<Option<String>> = Vec::new();
        let cases = [
            ("hello[world]", "hello_world"),
            ("hello[foo][baz]", "hello_foo_baz"),
        ];

        for (raw, expected) in cases {
            assert_eq!(
                normalize_field_name(raw, &mut mapping, &no_existing),
                expected
            );
        }
    }

    /// Brackets nested inside brackets normalize the same way.
    #[test]
    fn deep_nested_brackets_normalize() {
        let mut mapping = HashMap::new();
        let normalized = normalize_field_name("a[b[c[d]]]", &mut mapping, &[]);
        assert_eq!(normalized, "a_b_c_d");
    }
}